Skip to content

Commit

Permalink
Catch URL syntax errors + cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
fivefilters committed Apr 20, 2024
1 parent a00d35c commit ca965c2
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 28 deletions.
11 changes: 7 additions & 4 deletions src/Nodes/NodeTrait.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
use DOMNodeList;

/**
* @method \DOMNode removeAttribute($name)
* @property ?DOMNode $firstChild
* @property ?DOMNode $lastChild
* @property ?DOMNode $parentNode
* @property ?DOMNode $nextSibling
* @property ?DOMNode $previousSibling
*/
trait NodeTrait
{
Expand Down Expand Up @@ -525,7 +529,6 @@ public function isWhitespace()
*/
public function shiftingAwareGetElementsByTagName($tag)
{
/** @var $nodes DOMNodeList */
$nodes = $this->getElementsByTagName($tag);
$count = $nodes->length;

Expand All @@ -547,13 +550,13 @@ public function shiftingAwareGetElementsByTagName($tag)
* Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this
* function to get the first one that is an DOMElement node.
*
* @return \DOMElement|null
* @return DOMElement|null
*/
public function getFirstElementChild()
{
if ($this->childNodes instanceof \Traversable) {
foreach ($this->childNodes as $node) {
if ($node instanceof \DOMElement) {
if ($node instanceof DOMElement) {
return $node;
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/Nodes/NodeUtility.php
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public static function nextNode($node)
* Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
* element with the new tag name and importing it to the main DOMDocument.
*
* @param DOMNode $node
* @param DOMNode|DOMElement $node
* @param string $value
* @param bool $importAttributes
*
Expand Down Expand Up @@ -136,7 +136,7 @@ public static function removeNode($node)
* Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
* for parents.
*
* @param DOMNode $originalNode
* @param DOMNode|DOMElement $originalNode
* @param bool $ignoreSelfAndKids
*
* @return DOMNode
Expand Down
63 changes: 41 additions & 22 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ public function parse(?string $html = null)
*
* @param string $html
*/
public function loadHTML(string $html)
public function loadHTML(string $html): void
{
$this->logger->debug('[Loading] Loading HTML...');

Expand Down Expand Up @@ -378,7 +378,7 @@ private function getJSONLD(DOMDocument $dom): array
if (
!isset($parsed['@context']) ||
!is_string($parsed['@context']) ||
!preg_match('/^https?\:\/\/schema\.org$/', $parsed['@context'])
!preg_match('/^https?:\/\/schema\.org$/', $parsed['@context'])
) {
return $metadata;
}
Expand Down Expand Up @@ -459,7 +459,7 @@ private function getMetadata()
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

// name is a single value
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';

// Find description tags.
foreach ($this->dom->getElementsByTagName('meta') as $meta) {
Expand Down Expand Up @@ -576,10 +576,8 @@ private function getMetadata()

/**
* Returns all the images of the parsed article.
*
* @return array
*/
public function getImages()
public function getImages(): array
{
$result = [];
if ($this->getImage()) {
Expand All @@ -598,13 +596,15 @@ public function getImages()

if ($this->configuration->getFixRelativeURLs()) {
foreach ($result as &$imgSrc) {
$imgSrc = $this->toAbsoluteURI($imgSrc);
try {
$imgSrc = $this->toAbsoluteURI($imgSrc);
} catch (\Exception $err) {
// Invalid URL, leave as is
}
}
}

$result = array_unique(array_filter($result));

return $result;
return array_unique(array_filter($result));
}

/**
Expand Down Expand Up @@ -634,7 +634,11 @@ public function getMainImage()
}

if (!empty($imgUrl) && $this->configuration->getFixRelativeURLs()) {
$this->setImage($this->toAbsoluteURI($imgUrl));
try {
$this->setImage($this->toAbsoluteURI($imgUrl));
} catch (\Exception $err) {
// Invalid URL, leave as is
}
}
}

Expand All @@ -651,6 +655,7 @@ private function simplifyNestedElements(DOMDocument $article)

while ($node) {
if ($node->parentNode && in_array($node->nodeName, ['div', 'section']) && !($node->hasAttribute('id') && strpos($node->getAttribute('id'), 'readability') === 0)) {
/** @var DOMElement $node */
if ($node->isElementWithoutContent()) {
$node = NodeUtility::removeAndGetNext($node);
continue;
Expand Down Expand Up @@ -702,9 +707,9 @@ private function getArticleTitle()
* Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false
* I can assure you it works properly if you let the code run.
*/
if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) {
if (preg_match('/ [|\-\\\\\/>»] /i', $curTitle)) {
$titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /', $curTitle);
$curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle);
$curTitle = preg_replace('/(.*)[|\-\\\\\/>»] .*/i', '$1', $originalTitle);

$this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

Expand Down Expand Up @@ -787,7 +792,7 @@ private function toAbsoluteURI($uri)
$uri = trim($uri);

// If this is already an absolute URI, return it.
if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
if (preg_match('/^[a-zA-Z][a-zA-Z0-9+\-.]*:/', $uri)) {
return $uri;
}

Expand Down Expand Up @@ -955,6 +960,7 @@ private function getNodes($node)
$p->appendChild($childNode);
}
} elseif ($p !== null) {
/** @var DOMNode $p */
while ($p->lastChild && $p->lastChild->isWhitespace()) {
$p->removeChild($p->lastChild);
}
Expand Down Expand Up @@ -1210,7 +1216,7 @@ private function removeScripts(DOMDocument $dom)
*
* @param DOMDocument $dom
*/
private function prepDocument(DOMDocument $dom)
private function prepDocument(DOMDocument $dom): void
{
$this->logger->info('[PrepDocument] Preparing document for parsing...');

Expand Down Expand Up @@ -1244,6 +1250,7 @@ private function prepDocument(DOMDocument $dom)
*/

if ($replaced) {
/** @var DOMElement $p */
$p = $dom->createElement('p');
$br->parentNode->replaceChild($p, $br);

Expand Down Expand Up @@ -1682,7 +1689,7 @@ public function prepArticle(DOMDocument $article)

// Remove single-cell tables
foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
/** @var DOMNode $table */
/** @var DOMElement $table */
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
if ($tbody->hasSingleTagInsideElement('tr')) {
$row = $tbody->getFirstElementChild();
Expand Down Expand Up @@ -2233,20 +2240,32 @@ public function postProcessContent(DOMDocument $article)
if ($src) {
$this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

$media->setAttribute('src', $this->toAbsoluteURI($src));
try {
$media->setAttribute('src', $this->toAbsoluteURI($src));
} catch (\Exception $err) {
$this->logger->debug('[PostProcess] Invalid URL encountered');
}
}

if ($poster) {
$this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($poster, 0, 128)));

$media->setAttribute('poster', $this->toAbsoluteURI($poster));
try {
$media->setAttribute('poster', $this->toAbsoluteURI($poster));
} catch (\Exception $err) {
$this->logger->debug('[PostProcess] Invalid URL encountered');
}

}

if ($srcset) {
$newSrcset = preg_replace_callback(NodeUtility::$regexps['srcsetUrl'], function ($matches) {
$this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($matches[1], 0, 128)));

return $this->toAbsoluteURI($matches[1]) . $matches[2] . $matches[3];
try {
return $this->toAbsoluteURI($matches[1]) . $matches[2] . $matches[3];
} catch (\Exception $err) {
return $matches[1] . $matches[2] . $matches[3];
}
}, $srcset);

$media->setAttribute('srcset', $newSrcset);
Expand All @@ -2267,8 +2286,8 @@ public function postProcessContent(DOMDocument $article)
* Iterate over a NodeList, and return the first node that passes
* the supplied test function
*
* @param NodeList nodeList The NodeList.
* @param Function fn The test function.
* @param array nodeList The NodeList.
* @param callable fn The test function.
* @return DOMNode|null
*/
private function findNode(array $nodeList, callable $fn)
Expand Down

0 comments on commit ca965c2

Please sign in to comment.