From 515d26f543b30f8d098bffd3af989f82a2aca0a1 Mon Sep 17 00:00:00 2001 From: Daniel Silva Date: Thu, 9 Apr 2026 15:41:47 -0500 Subject: [PATCH 1/5] fix(rest-api): preserve Unicode characters in uploaded filenames (#35266) Jersey decodes multipart Content-Disposition filenames as ISO-8859-1, mangling non-ASCII characters. Re-interpret the bytes as UTF-8, normalize to NFC, then delegate to FileUtil.sanitizeFileName() for illegal-char removal. Refs: #35266 Co-Authored-By: Claude Sonnet 4.6 --- .../rest/api/v1/temp/TempFileResource.java | 12 +++++++++-- .../api/v1/temp/TempFileResourceTest.java | 20 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java index 1d832364db10..4aaa3b685fd4 100644 --- a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java +++ b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java @@ -22,6 +22,7 @@ import com.dotmarketing.exception.DoesNotExistException; import com.dotmarketing.portlets.workflows.business.WorkflowAPI; import com.dotmarketing.util.Config; +import com.dotmarketing.util.FileUtil; import com.dotmarketing.util.Logger; import com.dotmarketing.util.PageMode; import com.dotmarketing.util.UtilMethods; @@ -45,6 +46,8 @@ import org.glassfish.jersey.server.JSONP; import org.jetbrains.annotations.NotNull; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.ws.rs.*; @@ -222,8 +225,13 @@ private void saveMultipleBinary(final FormDataMultiPart body, final HttpServletR } private static @NotNull String sanitizeFileName(ContentDisposition meta) { - final String sanitize = meta.getFileName().replaceAll("[^\\x00-\\x7F]", StringPool.BLANK); - return sanitize; + // Jersey decodes multipart Content-Disposition filenames 
as ISO-8859-1. + // Re-interpret those bytes as UTF-8 to recover the original filename, + // then normalize to NFC for consistent Unicode representation. + final String raw = meta.getFileName(); + final String utf8Name = new String(raw.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); + final String nfcName = Normalizer.normalize(utf8Name, Normalizer.Form.NFC); + return FileUtil.sanitizeFileName(nfcName); } private void printResponseEntityViewResult(final OutputStream outputStream, diff --git a/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java b/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java index 8570b15b1773..e3b08aad3175 100644 --- a/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java +++ b/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java @@ -6,6 +6,8 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import java.nio.charset.StandardCharsets; + import com.dotcms.api.web.HttpServletRequestThreadLocal; import com.dotcms.contenttype.model.field.BinaryField; import com.dotcms.contenttype.model.field.Field; @@ -172,6 +174,24 @@ public void test_temp_resource_upload() throws IOException { assertTrue(dotTempFileOpt.get().length() > 0); } + @Test + public void test_temp_resource_upload_preserves_unicode_filename() throws IOException { + resetTempResourceConfig(); + Config.setProperty(TempFileAPI.TEMP_RESOURCE_ALLOW_ANONYMOUS, true); + + // Jersey decodes multipart Content-Disposition filenames as ISO-8859-1. + // Simulate what a macOS browser sends: NFD UTF-8 bytes re-interpreted as ISO-8859-1. 
+ final String expectedFileName = "Test_document_``$$#ääöüÄÖÜ.txt"; + final String jerseyEncodedName = new String( + expectedFileName.getBytes(StandardCharsets.UTF_8), StandardCharsets.ISO_8859_1); + + final HttpServletRequest request = mockRequest(); + final DotTempFile dotTempFile = saveTempFile_usingTempResource(jerseyEncodedName, request); + + assertEquals("Unicode characters must be preserved in the uploaded filename", + expectedFileName, dotTempFile.file.getName()); + } + @Test public void test_temp_resource_multifile_upload() throws IOException { resetTempResourceConfig(); From 3004c116bf554702e0f526575d7fafa961b246e0 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 20:49:51 +0000 Subject: [PATCH 2/5] fix(rest-api): remove duplicate StandardCharsets import; test NFD normalization path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove duplicate `import java.nio.charset.StandardCharsets` from TempFileResource.java (was imported at both line 49 and 62). - Remove duplicate `import java.nio.charset.StandardCharsets` from TempFileResourceTest.java (was imported at line 9 and 50). - Update unicode filename test to use NFD-encoded input so that the Normalizer.normalize(…, NFC) step in sanitizeFileName is actually exercised (previously the test used NFC literals, making the normalisation call a no-op). 
Co-authored-by: Daniel Silva --- .../com/dotcms/rest/api/v1/temp/TempFileResource.java | 1 - .../com/dotcms/rest/api/v1/temp/TempFileResourceTest.java | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java index 4aaa3b685fd4..1b968e6814c6 100644 --- a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java +++ b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java @@ -59,7 +59,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.net.URL; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; diff --git a/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java b/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java index e3b08aad3175..23ac0c838114 100644 --- a/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java +++ b/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java @@ -6,8 +6,6 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import java.nio.charset.StandardCharsets; - import com.dotcms.api.web.HttpServletRequestThreadLocal; import com.dotcms.contenttype.model.field.BinaryField; import com.dotcms.contenttype.model.field.Field; @@ -49,6 +47,7 @@ import java.io.RandomAccessFile; import java.nio.charset.StandardCharsets; import java.nio.file.Path; +import java.text.Normalizer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -181,9 +180,12 @@ public void test_temp_resource_upload_preserves_unicode_filename() throws IOExce // Jersey decodes multipart Content-Disposition filenames as ISO-8859-1. // Simulate what a macOS browser sends: NFD UTF-8 bytes re-interpreted as ISO-8859-1. 
+ // The expected result is NFC (canonical composition); input is deliberately NFD so that + // the Normalizer.normalize(…, NFC) step in sanitizeFileName is exercised. final String expectedFileName = "Test_document_``$$#ääöüÄÖÜ.txt"; + final String nfdFileName = Normalizer.normalize(expectedFileName, Normalizer.Form.NFD); final String jerseyEncodedName = new String( - expectedFileName.getBytes(StandardCharsets.UTF_8), StandardCharsets.ISO_8859_1); + nfdFileName.getBytes(StandardCharsets.UTF_8), StandardCharsets.ISO_8859_1); final HttpServletRequest request = mockRequest(); final DotTempFile dotTempFile = saveTempFile_usingTempResource(jerseyEncodedName, request); From 98e9eec3118fdd77aa03e47584738a7ce974994d Mon Sep 17 00:00:00 2001 From: Daniel Silva Date: Thu, 9 Apr 2026 16:16:18 -0500 Subject: [PATCH 3/5] fix(rest-api): fix duplicate and misplaced imports in TempFileResource Remove duplicate java.nio.charset.StandardCharsets import and move both new imports (StandardCharsets, Normalizer) into the existing java.* group in alphabetical order. 
Co-Authored-By: Claude Sonnet 4.6 --- .../java/com/dotcms/rest/api/v1/temp/TempFileResource.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java index 4aaa3b685fd4..eb8380652c82 100644 --- a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java +++ b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java @@ -46,8 +46,6 @@ import org.glassfish.jersey.server.JSONP; import org.jetbrains.annotations.NotNull; -import java.nio.charset.StandardCharsets; -import java.text.Normalizer; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.ws.rs.*; @@ -60,6 +58,7 @@ import java.io.OutputStream; import java.net.URL; import java.nio.charset.StandardCharsets; +import java.text.Normalizer; import java.util.ArrayList; import java.util.List; import java.util.Map; From d93d23fabbf776606d5a52fea434676d2cfba8e4 Mon Sep 17 00:00:00 2001 From: Daniel Silva Date: Thu, 9 Apr 2026 16:17:11 -0500 Subject: [PATCH 4/5] fix(rest-api): strip UTF-8 replacement chars from re-decoded filenames (#35266) When a non-browser client sends a genuine ISO-8859-1 filename whose bytes are not valid UTF-8, re-decoding inserts U+FFFD replacement characters. Strip them immediately so they are not persisted in the stored filename. FileUtil.sanitizeFileName() already falls back to a random name if stripping leaves an empty string. 
Refs: #35266 Co-Authored-By: Claude Sonnet 4.6 --- .../java/com/dotcms/rest/api/v1/temp/TempFileResource.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java index eb8380652c82..7e14e2906e7a 100644 --- a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java +++ b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java @@ -228,7 +228,8 @@ private void saveMultipleBinary(final FormDataMultiPart body, final HttpServletR // Re-interpret those bytes as UTF-8 to recover the original filename, // then normalize to NFC for consistent Unicode representation. final String raw = meta.getFileName(); - final String utf8Name = new String(raw.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); + final String utf8Name = new String(raw.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8) + .replace("\uFFFD", ""); final String nfcName = Normalizer.normalize(utf8Name, Normalizer.Form.NFC); return FileUtil.sanitizeFileName(nfcName); } From d724e18f87b8bcbe0ec11a228a5112b9a296d6b5 Mon Sep 17 00:00:00 2001 From: Daniel Silva Date: Fri, 10 Apr 2026 08:02:31 -0500 Subject: [PATCH 5/5] fix(rest-api): document ISO-8859-1 assumption in sanitizeFileName (#35266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comment clarifying that the ISO-8859-1 → UTF-8 round-trip assumes modern browsers send UTF-8 bytes per HTML5/RFC 7578, and that high bytes from genuine ISO-8859-1 legacy clients are silently dropped. 
Refs: #35266 Co-Authored-By: Claude Sonnet 4.6 --- .../java/com/dotcms/rest/api/v1/temp/TempFileResource.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java index 7e14e2906e7a..190201b6ade4 100644 --- a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java +++ b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java @@ -227,6 +227,9 @@ private void saveMultipleBinary(final FormDataMultiPart body, final HttpServletR // Jersey decodes multipart Content-Disposition filenames as ISO-8859-1. // Re-interpret those bytes as UTF-8 to recover the original filename, // then normalize to NFC for consistent Unicode representation. + // ASSUMPTION: modern browsers (HTML5 / RFC 7578) send UTF-8 bytes in + // Content-Disposition filenames. This round-trip silently drops high bytes + // from genuine ISO-8859-1 filenames sent by legacy or non-browser clients. final String raw = meta.getFileName(); final String utf8Name = new String(raw.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8) .replace("\uFFFD", "");