Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
namespace Flow\ETL\Adapter\XML;

use function Flow\ETL\DSL\array_to_rows;
use function Flow\Types\DSL\type_string;
use Flow\ETL\Exception\RuntimeException;
use Flow\ETL\Extractor\{FileExtractor, Limitable, LimitableExtractor, PathFiltering, Signal};
use Flow\ETL\{Extractor, FlowContext, Schema};
Expand Down Expand Up @@ -32,6 +33,11 @@ final class XMLParserExtractor implements Extractor, FileExtractor, LimitableExt
*/
private array $elements = [];

/**
* @var list<array<string, string>>
*/
private array $namespaceStack = [];

private ?\XMLParser $parser = null;

private ?Schema $schema = null;
Expand Down Expand Up @@ -79,6 +85,7 @@ public function endElementHandler(\XMLParser $parser, string $name) : void
}

array_pop($this->currentPath);
array_pop($this->namespaceStack);
}

public function extract(FlowContext $context) : \Generator
Expand Down Expand Up @@ -165,21 +172,39 @@ public function source() : Path
/**
 * Element-start callback: tracks the current path and namespace scope, and mirrors
 * captured elements into the writer.
 *
 * Attributes named `xmlns` or starting with `xmlns:` are treated as namespace
 * declarations. They are pushed onto the namespace stack for every element, captured
 * or not, so the stack stays aligned with the current path (endElementHandler pops both).
 *
 * When the element is the captured root (its path equals the configured node path, or
 * the node path is empty and this is the top-level element), capturing is switched on
 * and every declaration currently on the stack is written on it. Elements nested inside
 * a capture only write their own declarations. Elements outside of a capture are not
 * written at all.
 *
 * @param \XMLParser $parser parser that fired the event (not used by this handler)
 * @param string $name name of the element being opened
 * @param array<array-key, mixed> $attrs attributes of the element, keyed by attribute name
 */
public function startElementHandler(\XMLParser $parser, string $name, array $attrs) : void
{
    $this->currentPath[] = $name;

    $namespaceDeclarations = [];
    $otherAttributes = [];

    // Separate namespace declarations from regular attributes so declarations can be
    // tracked per nesting level and written before the remaining attributes.
    foreach ($attrs as $key => $value) {
        if ($key === 'xmlns' || str_starts_with($key, 'xmlns:')) {
            $namespaceDeclarations[$key] = \is_scalar($value) ? type_string()->cast($value) : '';
        } else {
            $otherAttributes[$key] = $value;
        }
    }

    // Pushed unconditionally (possibly empty) - one stack entry per open element.
    $this->namespaceStack[] = $namespaceDeclarations;

    $isCapturedRoot = implode('/', $this->currentPath) === $this->xmlNodePath
        || ($this->xmlNodePath === '' && \count($this->currentPath) === 1);

    if ($isCapturedRoot) {
        $this->capturing = true;
        // array_merge keeps the last value for a repeated string key, so a declaration
        // made closer to the captured element overrides the same prefix from an ancestor.
        $namespacesToEmit = array_merge(...$this->namespaceStack);
    } elseif ($this->capturing) {
        $namespacesToEmit = $namespaceDeclarations;
    } else {
        // Not inside a captured subtree: bookkeeping above is all that is needed.
        return;
    }

    $this->writer()->startElement($name);

    foreach ($namespacesToEmit as $nsKey => $nsValue) {
        $this->writer()->writeAttribute($nsKey, $nsValue);
    }

    foreach ($otherAttributes as $key => $value) {
        // Non-scalar attribute values are written as an empty string.
        $this->writer()->writeAttribute($key, \is_scalar($value) ? type_string()->cast($value) : '');
    }
}

Expand Down Expand Up @@ -221,6 +246,9 @@ private function freeParser() : void
xml_parser_free($this->parser);
$this->parser = null;
}

$this->namespaceStack = [];
$this->currentPath = [];
}

private function parser() : \XMLParser
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed xmlns="http://example.com/default">
<entry>
<title>Product 1</title>
</entry>
</feed>
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed xmlns:g="http://base.google.com/ns/1.0" xmlns:c="http://example.com/custom">
<entry>
<g:id>1</g:id>
<g:title>Product 1</g:title>
<c:tag>alpha</c:tag>
</entry>
<entry>
<g:id>2</g:id>
<g:title>Product 2</g:title>
<c:tag>beta</c:tag>
</entry>
</feed>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed xmlns:a="http://example.com/a">
<group xmlns:b="http://example.com/b">
<entry>
<a:alpha>A</a:alpha>
<b:beta>B</b:beta>
</entry>
</group>
</feed>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed xmlns:g="http://base.google.com/ns/1.0">
<group xmlns:g="http://example.com/override" xmlns:x="http://example.com/extra">
<entry>
<g:id>1</g:id>
<x:note>hello</x:note>
</entry>
</group>
</feed>
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed>
<entry xmlns:g="http://base.google.com/ns/1.0">
<g:title>Product 1</g:title>
</entry>
</feed>
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed xmlns:g="http://base.google.com/ns/1.0">
<entry xml:lang="en" g:priority="high">
<g:title>Product 1</g:title>
</entry>
</feed>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version='1.0' encoding='UTF-8'?>
<feed>
<entry xmlns:g="http://example.com/first">
<g:note>first</g:note>
</entry>
<entry>
<plain>second</plain>
</entry>
</feed>
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

use function Flow\ETL\Adapter\XML\from_xml;
use function Flow\ETL\DSL\config;
use function Flow\ETL\DSL\{df, flow_context, schema, xml_schema};
use function Flow\ETL\DSL\{df, flow_context, ref, schema, xml_schema};
use function Flow\Filesystem\DSL\path_real;
use function Flow\Types\DSL\type_string;
use Flow\ETL\Extractor\Signal;
Expand Down Expand Up @@ -111,6 +111,71 @@ public function test_reading_xml_from_path() : void
);
}

/**
 * Reads `feed/entry` nodes from the namespaced_feed.xml fixture and verifies that the
 * `g:title` element can be resolved through XPath on each extracted node, and that the
 * first serialized node carries both the `g` and `c` namespace declarations.
 */
public function test_reading_xml_with_ancestor_namespace_declaration() : void
{
    $frame = df()->read(from_xml(__DIR__ . '/../Fixtures/namespaced_feed.xml', 'feed/entry'));
    $titleExpression = ref('node')->xpath('/entry/g:title')->domElementValue();

    $rows = $frame->withEntry('title', $titleExpression)->fetch();

    $firstTitle = $rows[0]->valueOf('title');
    $secondTitle = $rows[1]->valueOf('title');

    self::assertSame(['Product 1', 'Product 2'], [$firstTitle, $secondTitle]);

    $serializedNode = type_string()->cast($rows[0]->valueOf('node'));

    // Both prefixes must be declared on the extracted node itself.
    self::assertStringContainsString('xmlns:g="http://base.google.com/ns/1.0"', $serializedNode);
    self::assertStringContainsString('xmlns:c="http://example.com/custom"', $serializedNode);
}

/**
 * Reads `feed/entry` from the namespaced_default.xml fixture and verifies that the
 * first serialized node contains the default (`xmlns="..."`) namespace declaration.
 */
public function test_reading_xml_with_default_namespace_declaration() : void
{
    $rows = df()
        ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_default.xml', 'feed/entry'))
        ->fetch();

    $serializedNode = type_string()->cast($rows[0]->valueOf('node'));

    self::assertStringContainsString('xmlns="http://example.com/default"', $serializedNode);
}

/**
 * Reads `feed/group/entry` from the namespaced_multi_ancestor.xml fixture and verifies
 * that the first serialized node contains both the `a` and the `b` namespace declarations.
 */
public function test_reading_xml_with_multi_ancestor_namespace_merge() : void
{
    $rows = df()
        ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_multi_ancestor.xml', 'feed/group/entry'))
        ->fetch();

    $serializedNode = type_string()->cast($rows[0]->valueOf('node'));

    self::assertStringContainsString('xmlns:a="http://example.com/a"', $serializedNode);
    self::assertStringContainsString('xmlns:b="http://example.com/b"', $serializedNode);
}

/**
 * Reads `feed/entry` from the namespaced_on_captured_root.xml fixture and verifies that
 * `g:title` resolves through XPath and that the serialized node still contains the
 * `g` namespace declaration.
 */
public function test_reading_xml_with_namespace_on_captured_root() : void
{
    $frame = df()->read(from_xml(__DIR__ . '/../Fixtures/namespaced_on_captured_root.xml', 'feed/entry'));
    $titleExpression = ref('node')->xpath('/entry/g:title')->domElementValue();

    $rows = $frame->withEntry('title', $titleExpression)->fetch();

    self::assertSame('Product 1', $rows[0]->valueOf('title'));

    $serializedNode = type_string()->cast($rows[0]->valueOf('node'));

    self::assertStringContainsString('xmlns:g="http://base.google.com/ns/1.0"', $serializedNode);
}

/**
 * Reads `feed/entry` from the namespaced_prefixed_attribute.xml fixture and verifies
 * that the serialized node contains the `g` namespace declaration together with the
 * `xml:lang` and `g:priority` attributes.
 */
public function test_reading_xml_with_prefixed_attribute() : void
{
    $rows = df()
        ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_prefixed_attribute.xml', 'feed/entry'))
        ->fetch();

    $serializedNode = type_string()->cast($rows[0]->valueOf('node'));

    self::assertStringContainsString('xmlns:g="http://base.google.com/ns/1.0"', $serializedNode);
    // Prefixed, non-declaration attributes must survive unchanged.
    self::assertStringContainsString('xml:lang="en"', $serializedNode);
    self::assertStringContainsString('g:priority="high"', $serializedNode);
}

public function test_reading_xml_with_schema() : void
{
$rows = df()
Expand All @@ -133,6 +198,50 @@ public function test_reading_xml_with_schema() : void
}
}

/**
 * Reads `feed/group/entry` from the namespaced_nested.xml fixture and verifies that the
 * serialized node declares `g` with the override URI and `x` with the extra URI, and
 * that the base Google namespace URI does not appear anywhere in it.
 */
public function test_reading_xml_with_shadowed_namespace_declaration() : void
{
    $rows = df()
        ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_nested.xml', 'feed/group/entry'))
        ->fetch();

    $serializedNode = type_string()->cast($rows[0]->valueOf('node'));

    self::assertStringContainsString('xmlns:g="http://example.com/override"', $serializedNode);
    self::assertStringContainsString('xmlns:x="http://example.com/extra"', $serializedNode);
    self::assertStringNotContainsString('http://base.google.com/ns/1.0', $serializedNode);
}

/**
 * Reads `feed/entry` from the namespaced_sibling_isolation.xml fixture and verifies
 * that the first node carries the `g` declaration while the second node contains
 * neither an `xmlns:g` declaration nor the first node's namespace URI.
 */
public function test_reading_xml_with_sibling_namespace_isolation() : void
{
    $extractor = from_xml(__DIR__ . '/../Fixtures/namespaced_sibling_isolation.xml', 'feed/entry');
    $rowsAsArray = df()->read($extractor)->fetch()->toArray();

    $nodeOfFirstEntry = type_string()->cast($rowsAsArray[0]['node']);
    $nodeOfSecondEntry = type_string()->cast($rowsAsArray[1]['node']);

    self::assertStringContainsString('xmlns:g="http://example.com/first"', $nodeOfFirstEntry);

    // Nothing declared on the first sibling may appear on the second one.
    self::assertStringNotContainsString('xmlns:g', $nodeOfSecondEntry);
    self::assertStringNotContainsString('http://example.com/first', $nodeOfSecondEntry);
}

/**
 * Reads `root/items/item` from the simple_items_flat.xml fixture and verifies that the
 * first serialized node is XML-equal to the expected namespace-free `<item>` document.
 */
public function test_reading_xml_without_namespaces_is_unchanged() : void
{
    $expectedXml = <<<'XML'
    <item item_attribute_01="1">
    <id id_attribute_01="1">1</id>
    </item>
    XML;

    $rows = df()
        ->read(from_xml(__DIR__ . '/../Fixtures/simple_items_flat.xml', 'root/items/item'))
        ->fetch();

    $actualXml = type_string()->cast($rows[0]->valueOf('node'));

    self::assertXmlStringEqualsXmlString($expectedXml, $actualXml);
}

public function test_signal_stop() : void
{
$extractor = (from_xml(path_real(__DIR__ . '/../Fixtures/flow_orders.xml')))->withXMLNodePath('root/row');
Expand Down
Loading