Skip to content

Commit

Permalink
Merge pull request #12 from elecena/fix/entities-handling
Browse files Browse the repository at this point in the history
Improve XML entities handling
  • Loading branch information
macbre committed Jan 15, 2024
2 parents a89ca71 + 542337e commit 8da89c3
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 9 deletions.
24 changes: 20 additions & 4 deletions src/XMLParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class XMLParser implements \Iterator
private ?string $currentTagName = null;
private array $currentTagAttributes = [];

private string $currentTagContent = '';

/**
* The stack of the XML node names as we go deeper into the tree.
*
Expand Down Expand Up @@ -84,6 +86,8 @@ public function startXML(\XMLParser $parser, string $tagName, array $attributes)
$this->currentTagName = $tagName;
$this->currentTagAttributes = $attributes;

$this->currentTagContent = '';

// append to the queue of items to iterate over
$this->nodesQueue[] = new Nodes\XMLNodeOpen(
name: $this->currentTagName,
Expand All @@ -94,19 +98,31 @@ public function startXML(\XMLParser $parser, string $tagName, array $attributes)
$this->nodeNamesStack[] = $tagName;
}

/**
 * Collects a piece of character data reported for the node that is currently open.
 *
 * The content of a single node can be delivered in several pieces, e.g. when it contains XML entities.
 *
 * For instance: <loc>https://example.com/index.html?ACTION=1004&amp;SITE=3</loc>
 *
 * is reported as three separate pieces: 'https://example.com/index.html?ACTION=1004', '&' and 'SITE=3'.
 *
 * Every piece is therefore appended to the content buffer instead of being queued right away;
 * the XMLNodeContent instance is "emitted" when the node is closed.
 *
 * @param \XMLParser $parser the parser instance passed to the handler (not used here)
 * @param string $tagContent the piece of character data to append to the buffer
 */
public function charXML(\XMLParser $parser, string $tagContent): void
{
    $this->currentTagContent = $this->currentTagContent . $tagContent;
}

public function endXML(\XMLParser $parser, string $tagName): void
{
// append to the queue of items to iterate over
$this->nodesQueue[] = new Nodes\XMLNodeContent(
name: $this->currentTagName,
attributes: $this->currentTagAttributes,
content: $tagContent,
content: $this->currentTagContent,
parentName: array_slice($this->nodeNamesStack, -2, 1)[0] ?: null
);
}

public function endXML(\XMLParser $parser, string $tagName): void
{
// Pop the node name off the end of stack
array_pop($this->nodeNamesStack);

Expand Down
26 changes: 26 additions & 0 deletions tests/XMLParserEntitiesTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?php

use Elecena\XmlIterator\Nodes\XMLNodeContent;
use Elecena\XmlIterator\Nodes\XMLNodeOpen;
use Elecena\XmlIterator\Nodes\XMLNodeClose;

/**
 * Checks that the content of <loc> nodes containing XML entities ("&amp;")
 * is reported as a single string with the entity decoded.
 */
class XMLParserEntitiesTest extends XMLParserTestCase
{
    /**
     * Opens the sitemap index fixture whose <loc> nodes contain "&amp;" entities.
     *
     * @return resource|false the stream handle as returned by fopen()
     */
    protected function getParserStream()
    {
        return fopen(__DIR__ . '/fixtures/sitemap-entities.xml', mode: 'rt');
    }

    public function testParsesTheLocNodesWithAmpersands(): void
    {
        $locations = [];

        foreach ($this->getParser()->iterateByNodeContent('loc') as $item) {
            $locations[] = $item->content;
        }

        // Compare the complete list strictly (type, order and count), so that every URL
        // with an "&amp;" entity in the fixture is verified, not only the first one.
        $this->assertSame(
            [
                'https://www.reichelt.com/index.html?ACTION=1004&SITE=1',
                'https://www.reichelt.com/index.html?ACTION=1004&SITE=2',
                'https://www.reichelt.com/index.html?ACTION=1004&SITE=3',
                'https://www.reichelt.com/index.html?ACTION=1004&SITE=4',
                'https://www.reichelt.de/magazin/sitemap.xml',
                'https://www.reichelt.com/magazin/fr/sitemap.xml',
                'https://www.reichelt.com/magazin/nl/sitemap.xml',
                'https://www.reichelt.com/magazin/en/sitemap.xml',
            ],
            $locations,
        );
    }
}
8 changes: 3 additions & 5 deletions tests/XMLParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,9 @@ public function testParsesTheLocNodes(): void
{
$locations = [];

foreach($this->getParser() as $item) {
if ($item instanceof XMLNodeContent && $item->name === 'loc') {
$locations[] = $item->content;
$this->assertEquals('sitemap', $item->parentName);
}
foreach($this->getParser()->iterateByNodeContent('loc') as $item) {
$locations[] = $item->content;
$this->assertEquals('sitemap', $item->parentName);
}

$this->assertCount(8, $locations);
Expand Down
28 changes: 28 additions & 0 deletions tests/fixtures/sitemap-entities.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<!-- https://www.reichelt.com/sitemap.xml -->
<sitemap>
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=1</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=2</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=3</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=4</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.de/magazin/sitemap.xml</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.com/magazin/fr/sitemap.xml</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.com/magazin/nl/sitemap.xml</loc>
</sitemap>
<sitemap>
<loc>https://www.reichelt.com/magazin/en/sitemap.xml</loc>
</sitemap>
</sitemapindex>

0 comments on commit 8da89c3

Please sign in to comment.