From 7dc1d667bf2e0c1095ed04fbb33645bf789b4955 Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz Date: Fri, 24 Apr 2026 23:00:52 +0200 Subject: [PATCH 1/2] fix: preserve XML namespace declarations when capturing subtrees --- .../ETL/Adapter/XML/XMLParserExtractor.php | 48 ++++++-- .../XML/Tests/Fixtures/namespaced_default.xml | 6 + .../XML/Tests/Fixtures/namespaced_feed.xml | 13 ++ .../XML/Tests/Fixtures/namespaced_nested.xml | 9 ++ .../Integration/XMLParserExtractorTest.php | 111 +++++++++++++++++- 5 files changed, 176 insertions(+), 11 deletions(-) create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_default.xml create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_feed.xml create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_nested.xml diff --git a/src/adapter/etl-adapter-xml/src/Flow/ETL/Adapter/XML/XMLParserExtractor.php b/src/adapter/etl-adapter-xml/src/Flow/ETL/Adapter/XML/XMLParserExtractor.php index 251e7ed556..c8a60b8056 100644 --- a/src/adapter/etl-adapter-xml/src/Flow/ETL/Adapter/XML/XMLParserExtractor.php +++ b/src/adapter/etl-adapter-xml/src/Flow/ETL/Adapter/XML/XMLParserExtractor.php @@ -5,6 +5,7 @@ namespace Flow\ETL\Adapter\XML; use function Flow\ETL\DSL\array_to_rows; +use function Flow\Types\DSL\type_string; use Flow\ETL\Exception\RuntimeException; use Flow\ETL\Extractor\{FileExtractor, Limitable, LimitableExtractor, PathFiltering, Signal}; use Flow\ETL\{Extractor, FlowContext, Schema}; @@ -32,6 +33,11 @@ final class XMLParserExtractor implements Extractor, FileExtractor, LimitableExt */ private array $elements = []; + /** + * @var list<array<string, string>> + */ + private array $namespaceStack = []; + private ?\XMLParser $parser = null; private ?Schema $schema = null; @@ -79,6 +85,7 @@ public function endElementHandler(\XMLParser $parser, string $name) : void } array_pop($this->currentPath); + array_pop($this->namespaceStack); } 
public function extract(FlowContext $context) : \Generator @@ -165,21 +172,39 @@ public function source() : Path public function startElementHandler(\XMLParser $parser, string $name, array $attrs) : void { $this->currentPath[] = $name; - $currentPathString = implode('/', $this->currentPath); - if ($currentPathString === $this->xmlNodePath || ($this->xmlNodePath === '' && \count($this->currentPath) === 1)) { - $this->capturing = true; - $this->writer()->startElement($name); + $namespaceDeclarations = []; + $otherAttributes = []; - foreach ($attrs as $key => $value) { - $this->writer()->writeAttribute($key, \is_scalar($value) ? (string) $value : ''); + foreach ($attrs as $key => $value) { + if ($key === 'xmlns' || str_starts_with($key, 'xmlns:')) { + $namespaceDeclarations[$key] = \is_scalar($value) ? type_string()->cast($value) : ''; + } else { + $otherAttributes[$key] = $value; } + } + + $this->namespaceStack[] = $namespaceDeclarations; + + $isCapturedRoot = implode('/', $this->currentPath) === $this->xmlNodePath || ($this->xmlNodePath === '' && \count($this->currentPath) === 1); + + if ($isCapturedRoot) { + $this->capturing = true; + $namespacesToEmit = array_merge(...$this->namespaceStack); } elseif ($this->capturing) { - $this->writer()->startElement($name); + $namespacesToEmit = $namespaceDeclarations; + } else { + return; + } - foreach ($attrs as $key => $value) { - $this->writer()->writeAttribute($key, \is_scalar($value) ? (string) $value : ''); - } + $this->writer()->startElement($name); + + foreach ($namespacesToEmit as $nsKey => $nsValue) { + $this->writer()->writeAttribute($nsKey, $nsValue); + } + + foreach ($otherAttributes as $key => $value) { + $this->writer()->writeAttribute($key, \is_scalar($value) ? 
type_string()->cast($value) : ''); } } @@ -221,6 +246,9 @@ private function freeParser() : void xml_parser_free($this->parser); $this->parser = null; } + + $this->namespaceStack = []; + $this->currentPath = []; } private function parser() : \XMLParser diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_default.xml b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_default.xml new file mode 100644 index 0000000000..dc90636bbd --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_default.xml @@ -0,0 +1,6 @@ + + + + Product 1 + + diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_feed.xml b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_feed.xml new file mode 100644 index 0000000000..14ef820925 --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_feed.xml @@ -0,0 +1,13 @@ + + + + 1 + Product 1 + alpha + + + 2 + Product 2 + beta + + diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_nested.xml b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_nested.xml new file mode 100644 index 0000000000..a14077a18b --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_nested.xml @@ -0,0 +1,9 @@ + + + + + 1 + hello + + + diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Integration/XMLParserExtractorTest.php b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Integration/XMLParserExtractorTest.php index 135d95f7df..3265c6356d 100644 --- a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Integration/XMLParserExtractorTest.php +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Integration/XMLParserExtractorTest.php @@ -6,7 +6,7 @@ use function 
Flow\ETL\Adapter\XML\from_xml; use function Flow\ETL\DSL\config; -use function Flow\ETL\DSL\{df, flow_context, schema, xml_schema}; +use function Flow\ETL\DSL\{df, flow_context, ref, schema, xml_schema}; use function Flow\Filesystem\DSL\path_real; use function Flow\Types\DSL\type_string; use Flow\ETL\Extractor\Signal; @@ -111,6 +111,71 @@ public function test_reading_xml_from_path() : void ); } + public function test_reading_xml_with_ancestor_namespace_declaration() : void + { + $rows = df() + ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_feed.xml', 'feed/entry')) + ->withEntry('title', ref('node')->xpath('/entry/g:title')->domElementValue()) + ->fetch(); + + self::assertSame( + ['Product 1', 'Product 2'], + [$rows[0]->valueOf('title'), $rows[1]->valueOf('title')], + ); + + $node = type_string()->cast($rows[0]->valueOf('node')); + + self::assertStringContainsString('xmlns:g="http://base.google.com/ns/1.0"', $node); + self::assertStringContainsString('xmlns:c="http://example.com/custom"', $node); + } + + public function test_reading_xml_with_default_namespace_declaration() : void + { + $node = type_string()->cast(df() + ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_default.xml', 'feed/entry')) + ->fetch()[0] + ->valueOf('node')); + + self::assertStringContainsString('xmlns="http://example.com/default"', $node); + } + + public function test_reading_xml_with_multi_ancestor_namespace_merge() : void + { + $node = type_string()->cast(df() + ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_multi_ancestor.xml', 'feed/group/entry')) + ->fetch()[0] + ->valueOf('node')); + + self::assertStringContainsString('xmlns:a="http://example.com/a"', $node); + self::assertStringContainsString('xmlns:b="http://example.com/b"', $node); + } + + public function test_reading_xml_with_namespace_on_captured_root() : void + { + $rows = df() + ->read(from_xml(__DIR__ . 
'/../Fixtures/namespaced_on_captured_root.xml', 'feed/entry')) + ->withEntry('title', ref('node')->xpath('/entry/g:title')->domElementValue()) + ->fetch(); + + self::assertSame('Product 1', $rows[0]->valueOf('title')); + self::assertStringContainsString( + 'xmlns:g="http://base.google.com/ns/1.0"', + type_string()->cast($rows[0]->valueOf('node')), + ); + } + + public function test_reading_xml_with_prefixed_attribute() : void + { + $node = type_string()->cast(df() + ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_prefixed_attribute.xml', 'feed/entry')) + ->fetch()[0] + ->valueOf('node')); + + self::assertStringContainsString('xmlns:g="http://base.google.com/ns/1.0"', $node); + self::assertStringContainsString('xml:lang="en"', $node); + self::assertStringContainsString('g:priority="high"', $node); + } + public function test_reading_xml_with_schema() : void { $rows = df() @@ -133,6 +198,50 @@ public function test_reading_xml_with_schema() : void } } + public function test_reading_xml_with_shadowed_namespace_declaration() : void + { + $node = type_string()->cast(df() + ->read(from_xml(__DIR__ . '/../Fixtures/namespaced_nested.xml', 'feed/group/entry')) + ->fetch()[0] + ->valueOf('node')); + + self::assertStringContainsString('xmlns:g="http://example.com/override"', $node); + self::assertStringContainsString('xmlns:x="http://example.com/extra"', $node); + self::assertStringNotContainsString('http://base.google.com/ns/1.0', $node); + } + + public function test_reading_xml_with_sibling_namespace_isolation() : void + { + $rows = df() + ->read(from_xml(__DIR__ . 
'/../Fixtures/namespaced_sibling_isolation.xml', 'feed/entry')) + ->fetch() + ->toArray(); + + $firstNode = type_string()->cast($rows[0]['node']); + $secondNode = type_string()->cast($rows[1]['node']); + + self::assertStringContainsString('xmlns:g="http://example.com/first"', $firstNode); + self::assertStringNotContainsString('xmlns:g', $secondNode); + self::assertStringNotContainsString('http://example.com/first', $secondNode); + } + + public function test_reading_xml_without_namespaces_is_unchanged() : void + { + self::assertXmlStringEqualsXmlString( + <<<'XML' + + 1 + +XML, + type_string()->cast( + df() + ->read(from_xml(__DIR__ . '/../Fixtures/simple_items_flat.xml', 'root/items/item')) + ->fetch()[0] + ->valueOf('node') + ) + ); + } + public function test_signal_stop() : void { $extractor = (from_xml(path_real(__DIR__ . '/../Fixtures/flow_orders.xml')))->withXMLNodePath('root/row'); From 3a56bea0faf4fbfeeb43839355585008a641deac Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz Date: Fri, 24 Apr 2026 23:21:21 +0200 Subject: [PATCH 2/2] fix: commit missing test files --- .../XML/Tests/Fixtures/namespaced_multi_ancestor.xml | 9 +++++++++ .../XML/Tests/Fixtures/namespaced_on_captured_root.xml | 6 ++++++ .../XML/Tests/Fixtures/namespaced_prefixed_attribute.xml | 6 ++++++ .../XML/Tests/Fixtures/namespaced_sibling_isolation.xml | 9 +++++++++ 4 files changed, 30 insertions(+) create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_multi_ancestor.xml create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_on_captured_root.xml create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_prefixed_attribute.xml create mode 100644 src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_sibling_isolation.xml diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_multi_ancestor.xml 
b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_multi_ancestor.xml new file mode 100644 index 0000000000..6be249144f --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_multi_ancestor.xml @@ -0,0 +1,9 @@ + + + + + A + B + + + diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_on_captured_root.xml b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_on_captured_root.xml new file mode 100644 index 0000000000..f450492b75 --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_on_captured_root.xml @@ -0,0 +1,6 @@ + + + + Product 1 + + diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_prefixed_attribute.xml b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_prefixed_attribute.xml new file mode 100644 index 0000000000..f12442989a --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_prefixed_attribute.xml @@ -0,0 +1,6 @@ + + + + Product 1 + + diff --git a/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_sibling_isolation.xml b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_sibling_isolation.xml new file mode 100644 index 0000000000..f18d1dc420 --- /dev/null +++ b/src/adapter/etl-adapter-xml/tests/Flow/ETL/Adapter/XML/Tests/Fixtures/namespaced_sibling_isolation.xml @@ -0,0 +1,9 @@ + + + + first + + + second + +