diff --git a/bundle/Command/ConvertXmlTextToRichTextCommand.php b/bundle/Command/ConvertXmlTextToRichTextCommand.php index ffea3c33..efffff61 100644 --- a/bundle/Command/ConvertXmlTextToRichTextCommand.php +++ b/bundle/Command/ConvertXmlTextToRichTextCommand.php @@ -64,7 +64,13 @@ protected function configure() 'disable-duplicate-id-check', null, InputOption::VALUE_NONE, - 'Disable the check for duplicate html ids in every attribute. This might increase execution time on large databases' + 'Disable the check for duplicate html ids in every attribute. This might decrease execution time on large databases' + ) + ->addOption( + 'disable-id-value-check', + null, + InputOption::VALUE_NONE, + 'Disable the check for non-validating id/name values. This might decrease execution time on large databases' ) ->addOption( 'test-content-object', @@ -123,7 +129,7 @@ protected function execute(InputInterface $input, OutputInterface $output) $dryRun = true; } - $this->convertFields($dryRun, $testContentId, !$input->getOption('disable-duplicate-id-check'), $output); + $this->convertFields($dryRun, $testContentId, !$input->getOption('disable-duplicate-id-check'), !$input->getOption('disable-id-value-check'), $output); } protected function getContentTypeIds($contentTypeIdentifiers) @@ -330,7 +336,7 @@ protected function updateFieldRow($dryRun, $id, $version, $datatext) } } - protected function convertFields($dryRun, $contentId, $checkDuplicateIds, OutputInterface $output) + protected function convertFields($dryRun, $contentId, $checkDuplicateIds, $checkIdValues, OutputInterface $output) { $count = $this->getRowCountOfContentObjectAttributes('ezxmltext', $contentId); @@ -345,7 +351,7 @@ protected function convertFields($dryRun, $contentId, $checkDuplicateIds, Output $inputValue = $row['data_text']; } - $converted = $this->converter->convert($this->createDocument($inputValue), $checkDuplicateIds, $row['id']); + $converted = $this->converter->convert($this->createDocument($inputValue), $checkDuplicateIds, $checkIdValues, $row['id']); $this->updateFieldRow($dryRun, $row['id'], $row['version'], $converted); diff --git a/lib/FieldType/XmlText/Converter/RichText.php b/lib/FieldType/XmlText/Converter/RichText.php index a6e7c7db..48001378 100644 --- a/lib/FieldType/XmlText/Converter/RichText.php +++ b/lib/FieldType/XmlText/Converter/RichText.php @@ -157,9 +157,44 @@ protected function reportNonUniqueIds(DOMDocument $document, $contentFieldId) $id = $node->attributes->getNamedItem('id')->nodeValue; // id has format "duplicated_id_foo_bar_idm45226413447104" where "foo_bar" is the duplicated id $duplicatedId = substr($id, strlen('duplicated_id_'), strrpos($id, '_') - strlen('duplicated_id_')); - if ($this->logger !== null) { - $this->logger->warning("Duplicated id in original ezxmltext for contentobject_attribute.id=$contentFieldId, automatically generated new id : $duplicatedId --> $id"); - } + $this->logger->warning("Duplicated id in original ezxmltext for contentobject_attribute.id=$contentFieldId, automatically generated new id : $duplicatedId --> $id"); + } + } + + protected function validateAttributeValues(DOMDocument $document, $contentFieldId) + { + $xpath = new DOMXPath($document); + $whitelist1st = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'; + $replaceStr1st = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; + + $whitelist = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-'; + $replaceStr = ''; + /* + * We want to pick elements which has id value + * #1 not starting with a..z or '_' + * #2 not a..z, '0..9', '_' or '-' after 1st character + * So, no xpath v2 to our disposal... + * 1st line : we check the 1st char(substring) in id, converts it to 'a' if it in whitelist(translate), then check if it string now starts with 'a'(starts-with), then we invert result(not) + * : So we replace first char with 'a' if it is whitelisted, then we select the element if id value does not start with 'a' + * 2nd line: now we check remaining(omit 1st char) part of string (substring), removes any character that *is* whitelisted(translate), then check if there are any non-whitelisted characters left(string-lenght) + * 3rd line: Due to the not() in 1st line, we pick all elements not matching that 1st line. That also includes elements not having a xml:id at all.. + * : So, we want to make sure we only pick elements which has a xml:id attribute. + */ + $nodes = $xpath->query("//*[ + ( + not(starts-with(translate(substring(@xml:id, 1, 1), '$whitelist1st', '$replaceStr1st'), 'a')) + or string-length(translate(substring(@xml:id, 2), '$whitelist', '$replaceStr')) > 0 + ) and string-length(@xml:id) > 0]"); + + if ($contentFieldId === null) { + $contentFieldId = '[unknown]'; + } + foreach ($nodes as $node) { + $orgValue = $node->attributes->getNamedItem('id')->nodeValue; + $newValue = 'rewrite_' . $node->attributes->getNamedItem('id')->nodeValue; + $newValue = preg_replace("/[^$whitelist]/", '_', $newValue); + $node->attributes->getNamedItem('id')->nodeValue = $newValue; + $this->logger->warning("Replaced non-validating id value in richtext for contentobject_attribute.id=$contentFieldId, changed from : $orgValue --> $newValue"); } } @@ -317,10 +352,11 @@ protected function checkEmptyEmbedTags(DOMDocument $inputDocument) * * @param DOMDocument $inputDocument * @param bool $checkDuplicateIds + * @param bool $checkIdValues * @param null|int $contentFieldId * @return string */ - public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $contentFieldId = null) + public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $checkIdValues = false, $contentFieldId = null) { $this->removeComments($inputDocument); @@ -329,6 +365,9 @@ public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, if ($checkDuplicateIds) { $this->reportNonUniqueIds($convertedDocument, $contentFieldId); } + if ($checkIdValues) { + $this->validateAttributeValues($convertedDocument, $contentFieldId); + } // Needed by some disabled output escaping (eg. legacy ezxml paragraph elements) $convertedDocumentNormalized = new DOMDocument(); @@ -339,7 +378,7 @@ public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $result = $convertedDocumentNormalized->saveXML(); - if (!empty($errors) && $this->logger !== null) { + if (!empty($errors)) { $this->logger->error( "Validation errors when converting ezxmltext for contentobject_attribute.id=$contentFieldId", ['result' => $result, 'errors' => $errors, 'xmlString' => $inputDocument->saveXML()] diff --git a/tests/lib/FieldType/Converter/RichTextTest.php b/tests/lib/FieldType/Converter/RichTextTest.php index 56cc6b88..4644414a 100644 --- a/tests/lib/FieldType/Converter/RichTextTest.php +++ b/tests/lib/FieldType/Converter/RichTextTest.php @@ -137,7 +137,7 @@ public function testConvert($inputFilePath, $outputFilePath) $richText = new RichText($apiRepositoryStub); $richText->setImageContentTypes([27]); - $result = $richText->convert($inputDocument, true); + $result = $richText->convert($inputDocument, true, true); $convertedDocument = $this->createDocument($result, false); $expectedDocument = $this->createDocument($outputFilePath); diff --git a/tests/lib/FieldType/Converter/_fixtures/richtext/input/041-anchor_invalid_id.xml b/tests/lib/FieldType/Converter/_fixtures/richtext/input/041-anchor_invalid_id.xml new file mode 100644 index 00000000..b2743b38 --- /dev/null +++ b/tests/lib/FieldType/Converter/_fixtures/richtext/input/041-anchor_invalid_id.xml @@ -0,0 +1,33 @@ + +
+ Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + +
diff --git a/tests/lib/FieldType/Converter/_fixtures/richtext/output/041-anchor_invalid_id.xml b/tests/lib/FieldType/Converter/_fixtures/richtext/output/041-anchor_invalid_id.xml new file mode 100644 index 00000000..889158bc --- /dev/null +++ b/tests/lib/FieldType/Converter/_fixtures/richtext/output/041-anchor_invalid_id.xml @@ -0,0 +1,34 @@ + +
+ Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + + Here is an anchor + + +