diff --git a/src/main/java/org/dom4j/io/XMLWriter.java b/src/main/java/org/dom4j/io/XMLWriter.java index 6dbb46ac..198825dc 100644 --- a/src/main/java/org/dom4j/io/XMLWriter.java +++ b/src/main/java/org/dom4j/io/XMLWriter.java @@ -1645,44 +1645,37 @@ protected String escapeElementEntities(String text) { for (i = 0; i < size; i++) { String entity = null; - char c = text.charAt(i); + int c = text.codePointAt(i); switch (c) { case '<': entity = "<"; - break; - case '>': entity = ">"; - break; - case '&': entity = "&"; - break; - case '\t': case '\n': case '\r': - // don't encode standard whitespace characters if (preserve) { entity = String.valueOf(c); } - break; default: if ((c < 32) || shouldEncodeChar(c)) { - entity = "&#" + (int) c + ";"; + entity = "&#" + c + ";"; } break; } + if (entity != null) { if (block == null) { block = text.toCharArray(); @@ -1691,6 +1684,12 @@ protected String escapeElementEntities(String text) { buffer.append(block, last, i - last); buffer.append(entity); last = i + 1; + if (Character.isSupplementaryCodePoint(c)) { + last++; + } + } + if (Character.isSupplementaryCodePoint(c)) { + i++; } } @@ -1739,53 +1738,37 @@ protected String escapeAttributeEntities(String text) { for (i = 0; i < size; i++) { String entity = null; - char c = text.charAt(i); + int c = text.codePointAt(i); switch (c) { case '<': entity = "<"; - break; - case '>': entity = ">"; - break; - case '\'': - if (quote == '\'') { entity = "'"; } - break; - case '\"': - if (quote == '\"') { entity = """; } - break; - case '&': entity = "&"; - break; - case '\t': case '\n': case '\r': - // don't encode standard whitespace characters break; - default: - if ((c < 32) || shouldEncodeChar(c)) { - entity = "&#" + (int) c + ";"; + entity = "&#" + c + ";"; } - break; } @@ -1797,6 +1780,12 @@ protected String escapeAttributeEntities(String text) { buffer.append(block, last, i - last); buffer.append(entity); last = i + 1; + if(Character.isSupplementaryCodePoint(c)) { + last++; + } + } + if(Character.isSupplementaryCodePoint(c)) { + i++; } } @@ -1822,15 +1811,15 @@ protected String escapeAttributeEntities(String text) { * Should the given character be escaped. This depends on the encoding of * the document. * - * @param c + * @param codepoint Unicode codepoint. * DOCUMENT ME! * * @return boolean */ - protected boolean shouldEncodeChar(char c) { + protected boolean shouldEncodeChar(int codepoint) { int max = getMaximumAllowedCharacter(); - return (max > 0) && (c > max); + return (max > 0) && (codepoint > max); } /** diff --git a/src/test/java/org/dom4j/XMLWriterTest.java b/src/test/java/org/dom4j/XMLWriterTest.java index cee1380e..397e6253 100644 --- a/src/test/java/org/dom4j/XMLWriterTest.java +++ b/src/test/java/org/dom4j/XMLWriterTest.java @@ -631,6 +631,62 @@ public void testElementNamespaceAttributesWriteOpen() throws IOException { Assert.assertEquals(stringWriter.toString(), ""); } + public void testPenguin() throws IOException { + // U+1F427 PENGUIN + final String penguin = "\ud83d\udc27"; + + Document document = DocumentHelper.createDocument(); + document.addElement("doc").setText(penguin); + + OutputFormat outputFormat = OutputFormat.createCompactFormat(); + outputFormat.setSuppressDeclaration(true); + + StringWriter stringWriter = new StringWriter(); + XMLWriter writer = new XMLWriter(stringWriter, outputFormat); + writer.write(document); + writer.close(); + + Assert.assertEquals(stringWriter.toString(), ""+penguin+""); + } + + public void testSurrogatePairElement() throws IOException { + // U+1F427 PENGUIN + final String penguin = "\ud83d\udc27"; + + Document document = DocumentHelper.createDocument(); + document.addElement("doc").setText(penguin); + + OutputFormat outputFormat = OutputFormat.createCompactFormat(); + outputFormat.setSuppressDeclaration(true); + outputFormat.setEncoding("US-ASCII"); + + StringWriter stringWriter = new StringWriter(); + XMLWriter writer = new XMLWriter(stringWriter, outputFormat); + writer.write(document); + writer.close(); + + Assert.assertEquals(stringWriter.toString(), "🐧"); + } + + public void testSurrogatePairAttribute() throws IOException { + // U+1F427 PENGUIN + final String penguin = "\ud83d\udc27"; + + Document document = DocumentHelper.createDocument(); + document.addElement("doc").addAttribute("penguin", penguin); + + OutputFormat outputFormat = OutputFormat.createCompactFormat(); + outputFormat.setSuppressDeclaration(true); + outputFormat.setEncoding("US-ASCII"); + + StringWriter stringWriter = new StringWriter(); + XMLWriter writer = new XMLWriter(stringWriter, outputFormat); + writer.write(document); + writer.close(); + + Assert.assertEquals(stringWriter.toString(), ""); + } + protected void generateXML(ContentHandler handler) throws SAXException { handler.startDocument();