charset: don't split multi-octet UTF-8 words in Q-encoded MIME headers

cyrusimap · Mar 11, 2019 · 3ca31fe
1 parent 0348bd7
commit 3ca31fe
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 17 deletions.
diff --git a/cunit/charset.testc b/cunit/charset.testc
@@ -274,6 +274,14 @@ static void test_encode_mimeheader(void)
         char *s = charset_encode_mimeheader(_in, 0, 0); \
         CU_ASSERT_PTR_NOT_NULL(s); \
         CU_ASSERT_STRING_EQUAL(s, _exp); \
+        const char *p, *lf; \
+        for (lf = s, p = s; *p != '\0'; p++) { \
+            if (*p == '\n') { \
+                CU_ASSERT(p - lf <= 76); \
+                lf = p; \
+            } \
+        } \
+        CU_ASSERT(p - lf <= 76); \
         free(s); \
     }
 
@@ -287,6 +295,15 @@ static void test_encode_mimeheader(void)
     /* wrap */
     TESTCASE("abc\r\n xyz", "=?UTF-8?Q?abc?=\r\n =?UTF-8?Q?xyz?=");
 
+    /* three-byte UTF-8 word barely fits line length limit */
+    TESTCASE("0123456789012345678901234567890123456789012345678901234\xe2\x82\xac",
+             "=?UTF-8?Q?0123456789012345678901234567890123456789012345678901234=E2=82=AC?=");
+
+    /* three-byte UTF-8 word must not be split */
+    TESTCASE("01234567890123456789012345678901234567890123456789012345\xe2\x82\xac",
+             "=?UTF-8?Q?01234567890123456789012345678901234567890123456789012345?="
+             "\r\n ""=?UTF-8?Q?=E2=82=AC?=");
+
 #undef TESTCASE
 }
 

diff --git a/lib/charset.c b/lib/charset.c
@@ -3058,20 +3058,26 @@ static char *qp_encode(const char *data, size_t len, int isheader,
             unsigned char this = data[n];
             unsigned char next = (n < len - 1) ? data[n+1] : '\0';
 
-            if (cnt >= ENCODED_MAX_LINE_LEN) {
-                if (!isheader) {
-                    /* add soft line break to body */
-                    buf_appendcstr(&buf, "=\r\n");
-                    cnt = 0;
-                }
-                else if (!ISUTF8CONTINUATION(this)) {
-                    /* split encoded token with fold */
-                    buf_appendcstr(&buf, "?=");
-                    buf_appendcstr(&buf, "\r\n ");
-                    buf_appendcstr(&buf, "=?UTF-8?Q?");
+            /* Insert line break before exceeding line length limits */
+            if (isheader) {
+                /* RFC2047 forbids splitting multi-octet characters */
+                int needbytes;
+                if (this < 0x80) needbytes = 0;
+                else if (this < 0xc0) needbytes = 0; // UTF-8 continuation
+                else if (this < 0xe0) needbytes = 3;
+                else if (this < 0xf0) needbytes = 6;
+                else if (this < 0xf8) needbytes = 9;
+                else needbytes = 0; // impossible UTF-8 encoding
+                if (cnt + needbytes >= ENCODED_MAX_LINE_LEN) {
+                    buf_appendcstr(&buf, "?=\r\n =?UTF-8?Q?");
                     cnt = 11;
                 }
             }
+            else if (cnt >= ENCODED_MAX_LINE_LEN) {
+                /* add soft line break to body */
+                buf_appendcstr(&buf, "=\r\n");
+                cnt = 0;
+            }
 
             if ((QPSAFECHAR[this]
                  /* per RFC 2047: '?' and '_' in header aren't safe */
@@ -3171,12 +3177,17 @@ EXPORTED char *charset_encode_mimephrase(const char *data)
     for (n = 0; data[n]; n++) {
         unsigned char this = data[n];
 
-        if (cnt >= ENCODED_MAX_LINE_LEN) {
-            if (!ISUTF8CONTINUATION(this)) {
-                /* split encoded token with fold */
-                buf_appendcstr(&buf, "?=\r\n =?UTF-8?Q?");
-                cnt = 11;
-            }
+        /* RFC2047 forbids splitting multi-octet characters */
+        int needbytes;
+        if (this < 0x80) needbytes = 0;
+        else if (this < 0xc0) needbytes = 0; // UTF-8 continuation
+        else if (this < 0xe0) needbytes = 3;
+        else if (this < 0xf0) needbytes = 6;
+        else if (this < 0xf8) needbytes = 9;
+        else needbytes = 0; // impossible UTF-8 encoding
+        if (cnt + needbytes >= ENCODED_MAX_LINE_LEN) {
+            buf_appendcstr(&buf, "?=\r\n =?UTF-8?Q?");
+            cnt = 11;
         }
 
         if (QPMIMEPHRASESAFECHAR[this]) {