Skip to content

Commit

Permalink
lib: Implement str_sanitize_utf8().
Browse files Browse the repository at this point in the history
Unlike str_sanitize(), this function truncates strings based on a UTF8 code point
limit rather than a maximum size in bytes. Also, the Unicode replacement
character is used to mark invalid/control characters and an ellipsis character
is used to indicate the string truncation. For the normal str_sanitize() this is
done using a question mark and triple dots respectively.
  • Loading branch information
stephanbosch committed Apr 17, 2018
1 parent 0ec60a1 commit a2de39b
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 1 deletion.
75 changes: 75 additions & 0 deletions src/lib/str-sanitize.c
Expand Up @@ -22,6 +22,27 @@ static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
return i;
}


/* Return the byte length of the leading part of src that needs no
   sanitizing. Scanning stops at the first of: max_chars characters consumed,
   end of string, a character that uni_utf8_get_char() does not accept
   (return value <= 0), or a control character (byte value below 32). */
static size_t
str_sanitize_skip_start_utf8(const char *src, uintmax_t max_chars)
{
	uintmax_t n_chars = 0;
	size_t pos = 0;
	unichar_t chr;

	while (n_chars < max_chars && src[pos] != '\0') {
		int char_len = uni_utf8_get_char(src + pos, &chr);

		/* stop at anything that would have to be replaced */
		if (char_len <= 0 || (unsigned char)src[pos] < 32)
			break;
		pos += char_len;
		n_chars++;
	}
	i_assert(n_chars <= max_chars);
	return pos;
}

static void str_sanitize_truncate_char(string_t *dest, unsigned int initial_pos)
{
const unsigned char *data = str_data(dest);
Expand Down Expand Up @@ -79,6 +100,42 @@ void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
}
}

/* Append a sanitized copy of src to dest, writing at most max_cps code
   points. Existing contents of dest are always preserved.

   - Bytes that uni_utf8_get_char() rejects as invalid UTF-8 are each written
     as UNICODE_REPLACEMENT_CHAR_UTF8 and count as one code point.
   - Characters whose first byte is below 32 (control characters) are written
     as UNICODE_REPLACEMENT_CHAR_UTF8.
   - If any input remains when the loop stops, the output is terminated with
     UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8. When the limit was reached, the
     ellipsis replaces the last written character so that the total stays
     within max_cps. When the input ended in the middle of a character, the
     ellipsis is simply appended.

   max_cps must be greater than zero. */
void str_sanitize_append_utf8(string_t *dest, const char *src,
			      uintmax_t max_cps)
{
	/* Offset in dest to cut back to before writing the ellipsis. Starts at
	   the current end of dest so that earlier contents can never be
	   truncated away. */
	size_t last_pos = str_len(dest);
	unichar_t chr;
	uintmax_t c;
	size_t i;

	i_assert(max_cps > 0);

	for (i = 0, c = 0; c < max_cps && src[i] != '\0'; ) {
		int len = uni_utf8_get_char(src+i, &chr);

		/* Record the position before the early-exit check: if the input
		   ends too early, nothing that was already written has to be
		   dropped to make room for the ellipsis (c < max_cps here). */
		last_pos = str_len(dest);
		if (len == 0)
			break; /* input ended too early */

		if (len < 0) {
			/* invalid UTF-8: emit one replacement character per
			   bad byte and count it against the limit, so the
			   output stays bounded by max_cps */
			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
			i++;
			c++;
			continue;
		}
		if ((unsigned char)src[i] < 32)
			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
		else
			str_append_n(dest, src+i, len);
		i += len;
		c++;
	}

	if (src[i] != '\0') {
		/* unconsumed input remains: mark the truncation */
		str_truncate(dest, last_pos);
		str_append(dest, UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8);
	}
}

const char *str_sanitize(const char *src, size_t max_bytes)
{
string_t *str;
Expand All @@ -95,3 +152,21 @@ const char *str_sanitize(const char *src, size_t max_bytes)
str_sanitize_append(str, src, max_bytes);
return str_c(str);
}

/* Return src sanitized with a limit of max_cps code points. NULL input gives
   NULL. If the whole string passes str_sanitize_skip_start_utf8() untouched,
   the src pointer itself is returned; otherwise the result is built with
   str_sanitize_append_utf8() into a string created by t_str_new(). */
const char *str_sanitize_utf8(const char *src, uintmax_t max_cps)
{
	string_t *sanitized;
	size_t clean_len;

	if (src == NULL)
		return NULL;

	clean_len = str_sanitize_skip_start_utf8(src, max_cps);
	if (src[clean_len] != '\0') {
		/* something needs replacing or truncating: build a copy */
		sanitized = t_str_new(I_MIN(max_cps, 256));
		str_sanitize_append_utf8(sanitized, src, max_cps);
		return str_c(sanitized);
	}
	return src;
}

9 changes: 9 additions & 0 deletions src/lib/str-sanitize.h
Expand Up @@ -6,8 +6,17 @@
src is treated as UTF-8 input, but max_bytes is in bytes instead of
UTF-8 characters. */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes);
/* Append src to dest as sanitized UTF-8. All control characters (bytes below
32) and invalid UTF-8 bytes in src will be appended as the unicode replacement
character (U+FFFD). If src has more than max_cps unicode code points, it's
truncated with a horizontal ellipsis character (U+2026) appended to the end.
The ellipsis takes the place of the last character that would otherwise have
been kept, e.g. "abc" with max_cps=2 results in "a" followed by U+2026.
max_cps must be greater than zero.
*/
void str_sanitize_append_utf8(string_t *dest, const char *src,
uintmax_t max_cps);
/* Return src sanitized. If there are no changes, src pointer is returned.
If src is NULL, returns NULL. */
const char *str_sanitize(const char *src, size_t max_bytes);
/* The unicode version of str_sanitize() using str_sanitize_append_utf8()
internally. max_cps is a limit in unicode code points instead of bytes.
If there are no changes, src pointer is returned. If src is NULL, returns
NULL. Otherwise the result is a new string created with t_str_new(). */
const char *str_sanitize_utf8(const char *src, uintmax_t max_cps);

#endif
63 changes: 62 additions & 1 deletion src/lib/test-str-sanitize.c
Expand Up @@ -10,7 +10,7 @@ struct str_sanitize_test {
const char *sanitized; /* NULL for no change */
};

void test_str_sanitize(void)
static void test_str_sanitize_max_bytes(void)
{
static const struct str_sanitize_test tests[] = {
{ NULL, 2, NULL },
Expand Down Expand Up @@ -64,3 +64,64 @@ void test_str_sanitize(void)
}
test_end();
}

/* Tests str_sanitize_utf8() and str_sanitize_append_utf8() against a shared
   table of { input, code point limit, expected output } entries. */
static void test_str_sanitize_max_codepoints(void)
{
/* A NULL expected value means "no change": str_sanitize_utf8() must return
   the input pointer itself and str_sanitize_append_utf8() must append the
   input verbatim. \xE2\x80\xA6 is U+2026 (horizontal ellipsis) and
   \xEF\xBF\xBD is U+FFFD (replacement character) in UTF-8. */
static const struct str_sanitize_test tests[] = {
{ NULL, 2, NULL },
{ "", 2, NULL },
{ "a", 2, NULL },
{ "ab", 2, NULL },
/* ASCII input longer than the limit: the ellipsis takes the last slot */
{ "abc", 2, "a\xE2\x80\xA6" },
{ "abcd", 3, "ab\xE2\x80\xA6" },
{ "abcde", 4, "abc\xE2\x80\xA6" },
/* a single two-byte character fits even when the limit is 1 */
{ "\xD1\x81", 1, "\xD1\x81" },
{ "\xD1\x81", 2, "\xD1\x81" },
{ "\xD1\x81", 3, NULL },
/* six code points in eight bytes: the limit counts code points */
{ "\xC3\xA4\xC3\xA4zyxa", 1, "\xE2\x80\xA6" },
{ "\xC3\xA4\xC3\xA4zyxa", 2, "\xC3\xA4\xE2\x80\xA6" },
{ "\xC3\xA4\xC3\xA4zyxa", 3, "\xC3\xA4\xC3\xA4\xE2\x80\xA6" },
{ "\xC3\xA4\xC3\xA4zyxa", 4, "\xC3\xA4\xC3\xA4z\xE2\x80\xA6" },
{ "\xC3\xA4\xC3\xA4zyxa", 5, "\xC3\xA4\xC3\xA4zy\xE2\x80\xA6" },
{ "\xC3\xA4\xC3\xA4zyxa", 6, "\xC3\xA4\xC3\xA4zyxa" },
{ "\xC3\xA4\xC3\xA4zyxa", 7, "\xC3\xA4\xC3\xA4zyxa" },
{ "\xC3\xA4\xC3\xA4zyxa", 8, "\xC3\xA4\xC3\xA4zyxa" },
/* control characters and an invalid byte become U+FFFD */
{ "\001x\x1fy\x81", 10, "\xEF\xBF\xBDx\xEF\xBF\xBDy\xEF\xBF\xBD" }
};
const char *str;
string_t *str2;
unsigned int i;

/* Pass 1: the variant that returns a string. */
test_begin("str_sanitize_utf8");
for (i = 0; i < N_ELEMENTS(tests); i++) {
str = str_sanitize_utf8(tests[i].str, tests[i].max_len);
if (tests[i].sanitized != NULL)
test_assert_idx(null_strcmp(str, tests[i].sanitized) == 0, i);
else
test_assert_idx(str == tests[i].str, i);
}
test_end();

/* Pass 2: the appending variant. The destination is pre-filled with a
   10-byte prefix to check that existing contents are left intact. */
test_begin("str_sanitize_append_utf8");
str2 = t_str_new(128);
for (i = 0; i < N_ELEMENTS(tests); i++) {
/* the NULL input entry is skipped for the append variant */
if (tests[i].str == NULL)
continue;
str_truncate(str2, 0);
str_append(str2, "1234567890");
str_sanitize_append_utf8(str2, tests[i].str, tests[i].max_len);

test_assert_idx(strncmp(str_c(str2), "1234567890", 10) == 0, i);
if (tests[i].sanitized != NULL)
test_assert_idx(strcmp(str_c(str2)+10, tests[i].sanitized) == 0, i);
else
test_assert_idx(strcmp(str_c(str2)+10, tests[i].str) == 0, i);
}
test_end();
}

void test_str_sanitize(void)
{
test_str_sanitize_max_bytes();
test_str_sanitize_max_codepoints();
}

0 comments on commit a2de39b

Please sign in to comment.