Skip to content

Commit

Permalink
#38 Support for supplementary unicode characters in XMLWriter.
Browse files Browse the repository at this point in the history
  • Loading branch information
FilipJirsak committed Jul 1, 2018
1 parent 351bfef commit 75e59b1
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 31 deletions.
51 changes: 20 additions & 31 deletions src/main/java/org/dom4j/io/XMLWriter.java
Expand Up @@ -1645,44 +1645,37 @@ protected String escapeElementEntities(String text) {

for (i = 0; i < size; i++) {
String entity = null;
char c = text.charAt(i);

int c = text.codePointAt(i);
switch (c) {
case '<':
entity = "&lt;";

break;

case '>':
entity = "&gt;";

break;

case '&':
entity = "&amp;";

break;

case '\t':
case '\n':
case '\r':

// don't encode standard whitespace characters
if (preserve) {
entity = String.valueOf(c);
}

break;

default:

if ((c < 32) || shouldEncodeChar(c)) {
entity = "&#" + (int) c + ";";
entity = "&#" + c + ";";
}

break;
}


if (entity != null) {
if (block == null) {
block = text.toCharArray();
Expand All @@ -1691,6 +1684,12 @@ protected String escapeElementEntities(String text) {
buffer.append(block, last, i - last);
buffer.append(entity);
last = i + 1;
if (Character.isSupplementaryCodePoint(c)) {
last++;
}
}
if (Character.isSupplementaryCodePoint(c)) {
i++;
}
}

Expand Down Expand Up @@ -1739,53 +1738,37 @@ protected String escapeAttributeEntities(String text) {

for (i = 0; i < size; i++) {
String entity = null;
char c = text.charAt(i);
int c = text.codePointAt(i);

switch (c) {
case '<':
entity = "&lt;";

break;

case '>':
entity = "&gt;";

break;

case '\'':

if (quote == '\'') {
entity = "&apos;";
}

break;

case '\"':

if (quote == '\"') {
entity = "&quot;";
}

break;

case '&':
entity = "&amp;";

break;

case '\t':
case '\n':
case '\r':

// don't encode standard whitespace characters
break;

default:

if ((c < 32) || shouldEncodeChar(c)) {
entity = "&#" + (int) c + ";";
entity = "&#" + c + ";";
}

break;
}

Expand All @@ -1797,6 +1780,12 @@ protected String escapeAttributeEntities(String text) {
buffer.append(block, last, i - last);
buffer.append(entity);
last = i + 1;
if(Character.isSupplementaryCodePoint(c)) {
last++;
}
}
if(Character.isSupplementaryCodePoint(c)) {
i++;
}
}

Expand All @@ -1822,15 +1811,15 @@ protected String escapeAttributeEntities(String text) {
* Should the given character be escaped. This depends on the encoding of
* the document.
*
* @param c
* @param codepoint Unicode codepoint.
* The value that is compared against the maximum allowed character.
*
* @return boolean
*/
protected boolean shouldEncodeChar(char c) {
protected boolean shouldEncodeChar(int codepoint) {
int max = getMaximumAllowedCharacter();

return (max > 0) && (c > max);
return (max > 0) && (codepoint > max);
}

/**
Expand Down
56 changes: 56 additions & 0 deletions src/test/java/org/dom4j/XMLWriterTest.java
Expand Up @@ -631,6 +631,62 @@ public void testElementNamespaceAttributesWriteOpen() throws IOException {
Assert.assertEquals(stringWriter.toString(), "<rss xmlns:g=\"http://base.google.com/ns/1.0\" xmlns:c=\"http://base.google.com/cns/1.0\" nons=\"value\" g:ns=\"value\">");
}

/**
 * Writes an element whose text is a single supplementary character, without
 * setting an encoding on the output format, and expects the character to
 * appear unescaped in the serialized output.
 */
public void testPenguin() throws IOException {
    // U+1F427 PENGUIN, expressed as a UTF-16 surrogate pair
    final String text = "\ud83d\udc27";

    OutputFormat format = OutputFormat.createCompactFormat();
    format.setSuppressDeclaration(true);

    Document doc = DocumentHelper.createDocument();
    doc.addElement("doc").setText(text);

    StringWriter out = new StringWriter();
    XMLWriter xmlWriter = new XMLWriter(out, format);
    xmlWriter.write(doc);
    xmlWriter.close();

    final String expected = "<doc>" + text + "</doc>";
    Assert.assertEquals(out.toString(), expected);
}

/**
 * Writes an element whose text is a single supplementary character with the
 * output encoding set to US-ASCII, and expects the surrogate pair to be
 * emitted as one numeric character reference for the whole code point.
 */
public void testSurrogatePairElement() throws IOException {
    // U+1F427 PENGUIN, expressed as a UTF-16 surrogate pair
    final String text = "\ud83d\udc27";

    OutputFormat format = OutputFormat.createCompactFormat();
    format.setSuppressDeclaration(true);
    format.setEncoding("US-ASCII");

    Document doc = DocumentHelper.createDocument();
    doc.addElement("doc").setText(text);

    StringWriter out = new StringWriter();
    XMLWriter xmlWriter = new XMLWriter(out, format);
    xmlWriter.write(doc);
    xmlWriter.close();

    // 128039 is the decimal value of U+1F427
    final String expected = "<doc>&#128039;</doc>";
    Assert.assertEquals(out.toString(), expected);
}

/**
 * Writes an element carrying an attribute whose value is a single
 * supplementary character with the output encoding set to US-ASCII, and
 * expects the surrogate pair to be emitted as one numeric character
 * reference inside the attribute value.
 */
public void testSurrogatePairAttribute() throws IOException {
    // U+1F427 PENGUIN, expressed as a UTF-16 surrogate pair
    final String value = "\ud83d\udc27";

    OutputFormat format = OutputFormat.createCompactFormat();
    format.setSuppressDeclaration(true);
    format.setEncoding("US-ASCII");

    Document doc = DocumentHelper.createDocument();
    doc.addElement("doc").addAttribute("penguin", value);

    StringWriter out = new StringWriter();
    XMLWriter xmlWriter = new XMLWriter(out, format);
    xmlWriter.write(doc);
    xmlWriter.close();

    // 128039 is the decimal value of U+1F427
    final String expected = "<doc penguin=\"&#128039;\"/>";
    Assert.assertEquals(out.toString(), expected);
}

protected void generateXML(ContentHandler handler) throws SAXException {
handler.startDocument();

Expand Down

0 comments on commit 75e59b1

Please sign in to comment.