Skip to content
Browse files

Add an encoder option to escape unicode.

The encoder can now return \u escaped unicode data instead of leaving
it as UTF-8 byte sequences. This is done like so:

    Eshell V5.8.3  (abort with ^G)
    1> jiffy:encode(<<240, 144, 129, 128>>, [uescape]).
    <<"\"\\uD800\\uDC40\"">>
  • Loading branch information...
1 parent a2a7bc9 commit 2305ded365fe12360c9c482e8d695c6e1509d2f9 @davisp davisp committed May 28, 2011
Showing with 204 additions and 120 deletions.
  1. +12 −47 c_src/decoder.c
  2. +47 −41 c_src/encoder.c
  3. +2 −1 c_src/jiffy.c
  4. +7 −2 c_src/jiffy.h
  5. +128 −25 c_src/utf8.c
  6. +7 −3 src/jiffy.erl
  7. +1 −1 test/cases/string_invalid_hex_char.erl
View
59 c_src/decoder.c
@@ -196,10 +196,10 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
return 0;
}
hi = int_from_hex(&(d->u[d->i]));
- d->i += 4;
if(hi < 0) {
return 0;
}
+ d->i += 4;
if(hi >= 0xD800 && hi < 0xDC00) {
if(d->i + 6 >= d->len) {
return 0;
@@ -213,7 +213,7 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
if(lo < 0) {
return 0;
}
- hi = utf8_from_pair(hi, lo);
+ hi = unicode_from_pair(hi, lo);
if(hi < 0) {
return 0;
}
@@ -234,52 +234,11 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
} else if(d->u[d->i] < 0x80) {
d->i++;
} else {
- ulen = -1;
- if((d->u[d->i] & 0xE0) == 0xC0) {
- ulen = 1;
- } else if((d->u[d->i] & 0xF0) == 0xE0) {
- ulen = 2;
- } else if((d->u[d->i] & 0xF8) == 0xF0) {
- ulen = 3;
- } else if((d->u[d->i] & 0xFC) == 0xF8) {
- ulen = 4;
- } else if((d->u[d->i] & 0xFE) == 0xFC) {
- ulen = 5;
- }
+ ulen = utf8_validate(&(d->u[d->i]), d->len - d->i);
if(ulen < 0) {
return 0;
}
- if(d->i + ulen >= d->len) {
- return 0;
- }
- for(ui = 0; ui < ulen; ui++) {
- if((d->u[d->i+1+ui] & 0xC0) != 0x80) {
- return 0;
- }
- }
- // Wikipedia says I have to check that a UTF-8 encoding
- // uses as few bits as possible. This means that we
- // can't do things like encode 't' in three bytes.
- // To check this all we need to ensure is that for each
- // of the following bit patterns that there is at least
- // one 1 bit in any of the x's
- // 11: 110xxxxy 10yyyyyy
- // 16: 1110xxxx 10xyyyyy 10yyyyyy
- // 21: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
- // 26: 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
- // 31: 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
- if(ulen == 1) {
- if((d->u[d->i] & 0x1E) == 0) return 0;
- } else if(ulen == 2) {
- if((d->u[d->i] & 0x0F) + (d->u[d->i+1] & 0x20) == 0) return 0;
- } else if(ulen == 3) {
- if((d->u[d->i] & 0x07) + (d->u[d->i+1] & 0x30) == 0) return 0;
- } else if(ulen == 4) {
- if((d->u[d->i] & 0x03) + (d->u[d->i+1] & 0x38) == 0) return 0;
- } else if(ulen == 5) {
- if((d->u[d->i] & 0x01) + (d->u[d->i+1] & 0x3C) == 0) return 0;
- }
- d->i += 1 + ulen;
+ d->i += ulen;
}
}
@@ -336,14 +295,20 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
case 'u':
ui++;
hi = int_from_hex(&(d->u[ui]));
+ if(hi < 0) {
+ return 0;
+ }
if(hi >= 0xD800 && hi < 0xDC00) {
lo = int_from_hex(&(d->u[ui+6]));
- hi = utf8_from_pair(hi, lo);
+ if(lo < 0) {
+ return 0;
+ }
+ hi = unicode_from_pair(hi, lo);
ui += 10;
} else {
ui += 4;
}
- hi = utf8_to_binary(hi, (unsigned char*) chrbuf+chrpos);
+ hi = unicode_to_utf8(hi, (unsigned char*) chrbuf+chrpos);
if(hi < 0) {
return 0;
}
View
88 c_src/encoder.c
@@ -14,6 +14,7 @@
typedef struct {
ErlNifEnv* env;
jiffy_st* atoms;
+ int uescape;
int count;
@@ -28,12 +29,26 @@ typedef struct {
} Encoder;
int
-enc_init(Encoder* e, ErlNifEnv* env, ErlNifBinary* bin)
+enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin)
{
+ ERL_NIF_TERM val;
+
e->env = env;
e->atoms = enif_priv_data(env);
+ e->uescape = 0;
e->count = 0;
+ if(!enif_is_list(env, opts)) {
+ return 0;
+ }
+
+ while(enif_get_list_cell(env, opts, &val, &opts)) {
+ if(enif_compare(val, e->atoms->atom_uescape) == 0) {
+ e->uescape = 1;
+ } else {
+ return 0;
+ }
+ }
e->iolen = 0;
e->iolist = enif_make_list(env, 0);
@@ -183,7 +198,7 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
int esc_extra = 0;
int ulen;
- int ui;
+ int uval;
int i;
if(enif_is_binary(e->env, val)) {
@@ -225,46 +240,21 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
i++;
continue;
}
- ulen = -1;
- if((data[i] & 0xE0) == 0xC0) {
- ulen = 1;
- } else if((data[i] & 0xF0) == 0xE0) {
- ulen = 2;
- } else if((data[i] & 0xF8) == 0xF0) {
- ulen = 3;
- } else if((data[i] & 0xFC) == 0xF8) {
- ulen = 4;
- } else if((data[i] & 0xFE) == 0xFC) {
- ulen = 5;
- }
+ ulen = utf8_validate(&(data[i]), size - i);
if(ulen < 0) {
return 0;
}
- if(i+1+ulen > size) {
- return 0;
- }
- for(ui = 0; ui < ulen; ui++) {
- if((data[i+1+ui] & 0xC0) != 0x80) {
+ if(e->uescape) {
+ uval = utf8_to_unicode(&(data[i]), ulen);
+ if(uval < 0) {
return 0;
}
- }
- if(ulen == 1) {
- if((data[i] & 0x1E) == 0)
- return 0;
- } else if(ulen == 2) {
- if((data[i] & 0x0F) + (data[i+1] & 0x20) == 0)
- return 0;
- } else if(ulen == 3) {
- if((data[i] & 0x07) + (data[i+1] & 0x30) == 0)
- return 0;
- } else if(ulen == 4) {
- if((data[i] & 0x03) + (data[i+1] & 0x38) == 0)
- return 0;
- } else if(ulen == 5) {
- if((data[i] & 0x01) + (data[i+1] & 0x3C) == 0)
+ ulen = utf8_esc_len(uval);
+ if(ulen < 0) {
return 0;
+ }
}
- i += 1 + ulen;
+ i += ulen;
}
}
@@ -311,13 +301,29 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
continue;
default:
if(data[i] < 0x20) {
- e->p[e->i++] = '\\';
- e->p[e->i++] = 'u';
- if(!int_to_hex(data[i], &(e->p[e->i]))) {
+ ulen = unicode_uescape(data[i], &(e->p[e->i]));
+ if(ulen < 0) {
return 0;
}
- e->i += 4;
+ e->i += ulen;
i++;
+ } else if((data[i] & 0x80) && e->uescape) {
+ uval = utf8_to_unicode(&(data[i]), size-i);
+ if(uval < 0) {
+ return 0;
+ }
+
+ ulen = unicode_uescape(uval, &(e->p[e->i]));
+ if(ulen < 0) {
+ return 0;
+ }
+ e->i += ulen;
+
+ ulen = utf8_len(uval);
+ if(ulen < 0) {
+ return 0;
+ }
+ i += ulen;
} else {
e->u[e->i++] = data[i++];
}
@@ -424,11 +430,11 @@ encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
double dval;
long lval;
- if(argc != 1) {
+ if(argc != 2) {
return enif_make_badarg(env);
}
- if(!enc_init(e, env, &bin)) {
+ if(!enc_init(e, env, argv[1], &bin)) {
return enif_make_badarg(env);
}
View
3 c_src/jiffy.c
@@ -20,6 +20,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info)
st->atom_bignum_e = make_atom(env, "bignum_e");
st->atom_bigdbl = make_atom(env, "bigdbl");
st->atom_partial = make_atom(env, "partial");
+ st->atom_uescape = make_atom(env, "uescape");
// Markers used in encoding
st->ref_object = make_atom(env, "$object_ref$");
@@ -53,7 +54,7 @@ unload(ErlNifEnv* env, void* priv)
static ErlNifFunc funcs[] =
{
{"nif_decode", 1, decode},
- {"nif_encode", 1, encode}
+ {"nif_encode", 2, encode}
};
ERL_NIF_INIT(jiffy, funcs, &load, &reload, &upgrade, &unload);
View
9 c_src/jiffy.h
@@ -16,6 +16,7 @@ typedef struct {
ERL_NIF_TERM atom_bignum_e;
ERL_NIF_TERM atom_bigdbl;
ERL_NIF_TERM atom_partial;
+ ERL_NIF_TERM atom_uescape;
ERL_NIF_TERM ref_object;
ERL_NIF_TERM ref_array;
@@ -31,7 +32,11 @@ ERL_NIF_TERM encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
int int_from_hex(const unsigned char* p);
int int_to_hex(int val, char* p);
int utf8_len(int c);
-int utf8_from_pair(int hi, int lo);
-int utf8_to_binary(int c, unsigned char* buf);
+int utf8_esc_len(int c);
+int utf8_validate(unsigned char* data, size_t size);
+int utf8_to_unicode(unsigned char* buf, size_t size);
+int unicode_to_utf8(int c, unsigned char* buf);
+int unicode_from_pair(int hi, int lo);
+int unicode_uescape(int c, char* buf);
#endif // Included JIFFY_H
View
153 c_src/utf8.c
@@ -1,5 +1,7 @@
// This file is part of Jiffy released under the MIT license.
// See the LICENSE file for more information.
+#include "jiffy.h"
+#include <stdio.h>
static const char hexvals[256] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -42,7 +44,7 @@ int
int_to_hex(int val, char* p)
{
if(val < 0 || val > 65535)
- return 0;
+ return -1;
p[0] = hexdigits[(val >> 12) & 0xF];
p[1] = hexdigits[(val >> 8) & 0xF];
@@ -65,27 +67,107 @@ utf8_len(int c)
} else {
return -1;
}
- } else if(c < 0x200000) {
+ } else if(c <= 0x10FFFF) {
return 4;
- } else if(c < 0x4000000) {
- return 5;
- } else if(c < 0x80000000) {
+ } else {
+ return -1;
+ }
+}
+
// Returns the number of output bytes needed to write the code point `c`
// as JSON \uXXXX escapes:
//   6  for code points below 0x10000 (a single \uXXXX escape)
//   12 for code points up to 0x10FFFF (a \uXXXX\uXXXX surrogate pair)
//   -1 for anything that is not a code point (negative or > 0x10FFFF)
int
utf8_esc_len(int c)
{
    // Negative values are error markers elsewhere in this file; never
    // report an escape length for them.
    if(c < 0 || c > 0x10FFFF) {
        return -1;
    }
    if(c < 0x10000) {
        return 6;
    }
    return 12;
}
// Validates the single UTF-8 encoded code point that starts at data[0].
//
//   data - pointer to the first byte of the sequence
//   size - number of readable bytes starting at data
//
// Returns the length in bytes (1-4) of the sequence when it is well
// formed, or -1 when it is truncated, has a bad lead or continuation
// byte, is an overlong encoding, encodes a UTF-16 surrogate, or encodes
// a value above U+10FFFF.
int
utf8_validate(unsigned char* data, size_t size)
{
    int ulen;
    int ui;

    // Nothing to look at; do not touch data[0].
    if(size == 0) {
        return -1;
    }

    // The lead byte determines the total sequence length.
    if((data[0] & 0x80) == 0x00) {
        ulen = 1;
    } else if((data[0] & 0xE0) == 0xC0) {
        ulen = 2;
    } else if((data[0] & 0xF0) == 0xE0) {
        ulen = 3;
    } else if((data[0] & 0xF8) == 0xF0) {
        ulen = 4;
    } else {
        // Stray continuation byte or a 5/6 byte lead byte.
        return -1;
    }

    // ulen is 1..4 here so the cast is safe.
    if((size_t) ulen > size) {
        return -1;
    }

    // Check each continuation byte.
    for(ui = 1; ui < ulen; ui++) {
        if((data[ui] & 0xC0) != 0x80) {
            return -1;
        }
    }

    // A UTF-8 encoding must use as few bytes as possible. This means
    // that we can't do things like encode 't' in three bytes. To check
    // this we ensure that for each of the following bit patterns there
    // is at least one 1 bit in any of the x's
    //  1: 0yyyyyyy
    //  2: 110xxxxy 10yyyyyy
    //  3: 1110xxxx 10xyyyyy 10yyyyyy
    //  4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy

    // ulen == 1 passes by definition
    if(ulen == 2) {
        if((data[0] & 0x1E) == 0) {
            return -1;
        }
    } else if(ulen == 3) {
        if((data[0] & 0x0F) + (data[1] & 0x20) == 0) {
            return -1;
        }
        // ED A0..BF xx encodes U+D800..U+DFFF, the UTF-16 surrogates,
        // which are not valid in UTF-8.
        if(data[0] == 0xED && (data[1] & 0x20) != 0) {
            return -1;
        }
    } else if(ulen == 4) {
        if((data[0] & 0x07) + (data[1] & 0x30) == 0) {
            return -1;
        }
        // Anything past F4 8F BF BF is above U+10FFFF.
        if(data[0] > 0xF4 || (data[0] == 0xF4 && data[1] >= 0x90)) {
            return -1;
        }
    }

    return ulen;
}
+
// Decodes the single UTF-8 sequence starting at buf[0] into its code
// point value.
//
//   buf  - pointer to the lead byte of the sequence
//   size - number of readable bytes starting at buf
//
// Returns the code point, or -1 when buf is empty, the lead byte is not
// a valid 1-4 byte lead, fewer bytes are available than the lead byte
// requires, the value is a UTF-16 surrogate (U+D800..U+DFFF), or the
// value is above U+10FFFF.
//
// Continuation bytes are masked, not verified; this function does not
// detect malformed continuation bytes or overlong encodings.
int
utf8_to_unicode(unsigned char* buf, size_t size)
{
    int ret;

    // Never read buf[0] from an empty buffer.
    if(size == 0) {
        return -1;
    }

    if((buf[0] & 0x80) == 0x00) {
        // 0xxxxxxx
        return (int) buf[0];
    }

    if((buf[0] & 0xE0) == 0xC0 && size >= 2) {
        // 110xxxxy 10yyyyyy
        return ((buf[0] & 0x1F) << 6)
             | ((buf[1] & 0x3F));
    }

    if((buf[0] & 0xF0) == 0xE0 && size >= 3) {
        // 1110xxxx 10xyyyyy 10yyyyyy
        ret = ((buf[0] & 0x0F) << 12)
            | ((buf[1] & 0x3F) << 6)
            | ((buf[2] & 0x3F));
        // Surrogate halves are not code points in their own right.
        if(ret >= 0xD800 && ret <= 0xDFFF) {
            return -1;
        }
        return ret;
    }

    if((buf[0] & 0xF8) == 0xF0 && size >= 4) {
        // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
        ret = ((buf[0] & 0x07) << 18)
            | ((buf[1] & 0x3F) << 12)
            | ((buf[2] & 0x3F) << 6)
            | ((buf[3] & 0x3F));
        // Four bytes can hold up to 0x1FFFFF but Unicode stops at
        // 0x10FFFF.
        if(ret > 0x10FFFF) {
            return -1;
        }
        return ret;
    }

    return -1;
}
int
-utf8_to_binary(int c, unsigned char* buf)
+unicode_to_utf8(int c, unsigned char* buf)
{
if(c < 0x80) {
buf[0] = (unsigned char) c;
@@ -103,27 +185,48 @@ utf8_to_binary(int c, unsigned char* buf)
} else {
return -1;
}
- } else if(c < 0x200000) {
+ } else if(c < 0x10FFFF) {
buf[0] = (unsigned char) 0xF0 + (c >> 18);
buf[1] = (unsigned char) 0x80 + ((c >> 12) & 0x3F);
buf[2] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
buf[3] = (unsigned char) 0x80 + (c & 0x3F);
return 4;
- } else if(c < 0x4000000) {
- buf[0] = (unsigned char) 0xF8 + (c >> 24);
- buf[1] = (unsigned char) 0x80 + ((c >> 18) & 0x3F);
- buf[2] = (unsigned char) 0x80 + ((c >> 12) & 0x3F);
- buf[3] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
- buf[4] = (unsigned char) 0x80 + (c & 0x3F);
- return 5;
- } else if(c < 0x80000000) {
- buf[0] = (unsigned char) 0xFC + (c >> 30);
- buf[1] = (unsigned char) 0x80 + ((c >> 24) & 0x3F);
- buf[2] = (unsigned char) 0x80 + ((c >> 18) & 0x3F);
- buf[3] = (unsigned char) 0x80 + ((c >> 12) & 0x3F);
- buf[4] = (unsigned char) 0x80 + ((c >> 6) & 0x3F);
- buf[5] = (unsigned char) 0x80 + (c & 0x3F);
+ }
+ return -1;
+}
+
// Combines a UTF-16 surrogate pair into the code point it represents.
//
//   hi - high surrogate, must be in 0xD800..0xDBFF
//   lo - low surrogate, must be in 0xDC00..0xDFFF
//
// Returns the combined code point (0x10000..0x10FFFF), or -1 when
// either half is outside its range.
int
unicode_from_pair(int hi, int lo)
{
    int hi_ok = (hi >= 0xD800 && hi < 0xDC00);
    int lo_ok = (lo >= 0xDC00 && lo <= 0xDFFF);

    if(!hi_ok) {
        return -1;
    }
    if(!lo_ok) {
        return -1;
    }

    // Each half contributes 10 bits; the pair is offset by 0x10000.
    return 0x10000 + ((hi & 0x3FF) << 10) + (lo & 0x3FF);
}
+
// Writes the code point `val` to `p` as JSON \uXXXX escape text.
//
//   val - code point to escape
//   p   - output buffer; must have room for 6 bytes when val < 0x10000
//         and 12 bytes otherwise. No terminator is written.
//
// Code points below 0x10000 are written as one \uXXXX escape. Code
// points from 0x10000 to 0x10FFFF are written as a UTF-16 surrogate
// pair, \uXXXX\uXXXX (high surrogate first).
//
// Returns the number of bytes written (6 or 12), or -1 when val is
// negative, above 0x10FFFF, or int_to_hex reports a failure.
int
unicode_uescape(int val, char* p)
{
    int n;
    int hi;
    int lo;

    // Reject non code points before anything is written to p.
    if(val < 0 || val > 0x10FFFF) {
        return -1;
    }

    if(val < 0x10000) {
        p[0] = '\\';
        p[1] = 'u';
        if(int_to_hex(val, p+2) < 0) {
            return -1;
        }
        return 6;
    }

    // Split the 20 bit offset into two 10 bit halves. The high
    // surrogate carries the TOP ten bits, so shift right, not left.
    n = val - 0x10000;
    hi = 0xD800 | ((n >> 10) & 0x03FF);
    lo = 0xDC00 | (n & 0x03FF);

    p[0] = '\\';
    p[1] = 'u';
    if(int_to_hex(hi, p+2) < 0) {
        return -1;
    }
    p[6] = '\\';
    p[7] = 'u';
    if(int_to_hex(lo, p+8) < 0) {
        return -1;
    }
    return 12;
}
View
10 src/jiffy.erl
@@ -2,7 +2,7 @@
% See the LICENSE file for more information.
-module(jiffy).
--export([decode/1, encode/1]).
+-export([decode/1, encode/1, encode/2]).
-define(NOT_LOADED, not_loaded(?LINE)).
-on_load(init/0).
@@ -19,7 +19,11 @@ decode(Data) ->
encode(Data) ->
- case nif_encode(Data) of
+ encode(Data, []).
+
+
+encode(Data, Options) ->
+ case nif_encode(Data, Options) of
{error, _} = Error ->
throw(Error);
{partial, IOData} ->
@@ -95,6 +99,6 @@ not_loaded(Line) ->
nif_decode(_Data) ->
?NOT_LOADED.
-nif_encode(_Data) ->
+nif_encode(_Data, _Options) ->
?NOT_LOADED.
View
2 test/cases/string_invalid_hex_char.erl
@@ -1 +1 @@
-{error,{48,invalid_string}}.
+{error,{44,invalid_string}}.

0 comments on commit 2305ded

Please sign in to comment.
Something went wrong with that request. Please try again.