diff --git a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java
index 1d832364db10..190201b6ade4 100644
--- a/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java
+++ b/dotCMS/src/main/java/com/dotcms/rest/api/v1/temp/TempFileResource.java
@@ -22,6 +22,7 @@
 import com.dotmarketing.exception.DoesNotExistException;
 import com.dotmarketing.portlets.workflows.business.WorkflowAPI;
 import com.dotmarketing.util.Config;
+import com.dotmarketing.util.FileUtil;
 import com.dotmarketing.util.Logger;
 import com.dotmarketing.util.PageMode;
 import com.dotmarketing.util.UtilMethods;
@@ -57,6 +58,10 @@
 import java.io.OutputStream;
 import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -222,8 +227,48 @@ private void saveMultipleBinary(final FormDataMultiPart body, final HttpServletR
     }
 
+    /**
+     * Recovers the client-supplied filename of a multipart body part and makes it safe to use.
+     * <p>
+     * Jersey decodes multipart Content-Disposition filenames as ISO-8859-1, while modern
+     * browsers (HTML5 / RFC 6266) send UTF-8 bytes. The name is therefore repaired where that
+     * is provably lossless, normalized to NFC for a consistent Unicode representation, and
+     * finally passed through {@link FileUtil#sanitizeFileName(String)}.
+     *
+     * @param meta Content-Disposition header of the uploaded body part
+     * @return the repaired, NFC-normalized and sanitized filename
+     */
     private static @NotNull String sanitizeFileName(ContentDisposition meta) {
-        final String sanitize = meta.getFileName().replaceAll("[^\\x00-\\x7F]", StringPool.BLANK);
-        return sanitize;
+        final String recoveredName = recoverUtf8FileName(meta.getFileName());
+        final String nfcName = Normalizer.normalize(recoveredName, Normalizer.Form.NFC);
+        return FileUtil.sanitizeFileName(nfcName);
+    }
+
+    /**
+     * Reverses an ISO-8859-1 mis-decoding of UTF-8 bytes, but only when the input is a strict,
+     * well-formed UTF-8 byte sequence; any other input is returned unchanged instead of being
+     * silently truncated.
+     *
+     * @param raw filename as handed over by the multipart parser
+     * @return the UTF-8 interpretation of {@code raw}, or {@code raw} itself if it is not
+     *         mis-decoded UTF-8
+     */
+    private static String recoverUtf8FileName(final String raw) {
+        // A char above U+00FF can never come out of an ISO-8859-1 decode, so the name already
+        // is real Unicode; round-tripping it through ISO-8859-1 would turn those chars into '?'.
+        if (raw.chars().anyMatch(c -> c > 0xFF)) {
+            return raw;
+        }
+        try {
+            return StandardCharsets.UTF_8.newDecoder()
+                    .onMalformedInput(CodingErrorAction.REPORT)
+                    .onUnmappableCharacter(CodingErrorAction.REPORT)
+                    .decode(ByteBuffer.wrap(raw.getBytes(StandardCharsets.ISO_8859_1)))
+                    .toString();
+        } catch (final CharacterCodingException ignored) {
+            // Not valid UTF-8: a genuine ISO-8859-1 name from a legacy or non-browser client.
+            // Keep it as decoded rather than dropping its high bytes.
+            return raw;
+        }
     }
 
     private void printResponseEntityViewResult(final OutputStream outputStream,
diff --git a/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java b/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java
index 8570b15b1773..23ac0c838114 100644
--- a/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java
+++ b/dotcms-integration/src/test/java/com/dotcms/rest/api/v1/temp/TempFileResourceTest.java
@@ -47,6 +47,7 @@
 import java.io.RandomAccessFile;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
+import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -172,6 +173,27 @@ public void test_temp_resource_upload() throws IOException {
         assertTrue(dotTempFileOpt.get().length() > 0);
     }
 
+    @Test
+    public void test_temp_resource_upload_preserves_unicode_filename() throws IOException {
+        resetTempResourceConfig();
+        Config.setProperty(TempFileAPI.TEMP_RESOURCE_ALLOW_ANONYMOUS, true);
+
+        // Jersey decodes multipart Content-Disposition filenames as ISO-8859-1.
+        // Simulate what a macOS browser sends: NFD UTF-8 bytes re-interpreted as ISO-8859-1.
+        // The expected result is NFC (canonical composition); input is deliberately NFD so that
+        // the Normalizer.normalize(…, NFC) step in sanitizeFileName is exercised.
+        final String expectedFileName = "Test_document_``$$#ääöüÄÖÜ.txt";
+        final String nfdFileName = Normalizer.normalize(expectedFileName, Normalizer.Form.NFD);
+        final String jerseyEncodedName = new String(
+                nfdFileName.getBytes(StandardCharsets.UTF_8), StandardCharsets.ISO_8859_1);
+
+        final HttpServletRequest request = mockRequest();
+        final DotTempFile dotTempFile = saveTempFile_usingTempResource(jerseyEncodedName, request);
+
+        assertEquals("Unicode characters must be preserved in the uploaded filename",
+                expectedFileName, dotTempFile.file.getName());
+    }
+
     @Test
     public void test_temp_resource_multifile_upload() throws IOException {
         resetTempResourceConfig();