Skip to content

Commit

Permalink
tec: Handle feeds declaring a wrong encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
marienfressinaud committed Sep 9, 2022
1 parent fe2e227 commit 0cb9840
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 1 deletion.
21 changes: 20 additions & 1 deletion lib/SpiderBits/src/feeds/Feed.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,26 @@ public static function fromText($feed_as_string)
$dom_document = new \DOMDocument();
$result = @$dom_document->loadXML($feed_as_string);
if (!$result) {
throw new \DomainException('Can’t parse the given string.');
// It might be an encoding issue. We try to recover by re-encoding
// the string with the declared encoding, or UTF-8. It will most
// probably generate a string with characters replaced by `?`, but
// at least it will be parsable.
$result = preg_match(
'/<?xml\s+(?:(?:.*?)\s)?encoding="(.+?)"/i',
$feed_as_string,
$matches
);
if ($result) {
$encoding = $matches[1];
} else {
$encoding = 'UTF-8';
}
$feed_as_string = mb_convert_encoding($feed_as_string, $encoding, $encoding);
$result = @$dom_document->loadXML($feed_as_string);

if (!$result) {
throw new \DomainException('Can’t parse the given string.');
}
}

if (AtomParser::canHandle($dom_document)) {
Expand Down
22 changes: 22 additions & 0 deletions tests/lib/SpiderBits/feeds/FeedTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,28 @@ public function testFromTextWithEmptyRss()
$this->assertSame('4a022608d595e9000d1f1be22a0a6a0763ad853d2417b1c8ea0ea12bd047bdcd', $feed->hash());
}

/**
 * Feed::fromText() must still return a feed when the XML prolog declares
 * UTF-8 but the bytes are actually ISO-8859-1. The accented character is
 * expected to come back as `?`.
 */
public function testFromTextRecoversFromWrongEncoding()
{
    // An XML document whose prolog announces UTF-8. Nowdoc is used because
    // nothing in the body needs interpolation.
    $utf8_document = <<<'XML'
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>My feed with an àccent</title>
<link>https://example.com</link>
</channel>
</rss>
XML;
    // Transcode the bytes to ISO-8859-1 while leaving the prolog untouched,
    // so the declared encoding no longer matches the real one.
    $mislabelled_document = mb_convert_encoding($utf8_document, 'ISO-8859-1', 'UTF-8');

    $parsed_feed = Feed::fromText($mislabelled_document);

    // The title keeps every character except the accented one, which is
    // expected to be replaced by `?`.
    $this->assertSame('My feed with an ?ccent', $parsed_feed->title);
    $this->assertSame('https://example.com', $parsed_feed->link);
    $this->assertSame(
        '0ca0efc2ed6d8bfab90dbd3f42a473465c505b3d13b4da5975b0deed52c4b231',
        $parsed_feed->hash()
    );
}

public function testFromTextFailsWithEmptyString()
{
$this->expectException(\DomainException::class);
Expand Down

0 comments on commit 0cb9840

Please sign in to comment.