diff --git a/ext/oj/dump.c b/ext/oj/dump.c index ad36f941..f3070c67 100644 --- a/ext/oj/dump.c +++ b/ext/oj/dump.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -89,7 +90,7 @@ static int dump_attr_cb(ID key, VALUE value, Out out); static void dump_obj_attrs(VALUE obj, int with_class, slot_t id, int depth, Out out); static void grow(Out out, size_t len); -static size_t json_friendly_size(const u_char *str, size_t len); +static size_t hibit_friendly_size(const u_char *str, size_t len); static size_t ascii_friendly_size(const u_char *str, size_t len); static void dump_leaf_to_json(Leaf leaf, Options copts, Out out); @@ -103,7 +104,7 @@ static void dump_leaf_hash(Leaf leaf, int depth, Out out); static const char hex_chars[17] = "0123456789abcdef"; -static char json_friendly_chars[256] = "\ +static char hibit_friendly_chars[256] = "\ 66666666222622666666666666666666\ 11211111111111121111111111111111\ 11111111111111111111111111112111\ @@ -113,22 +114,24 @@ static char json_friendly_chars[256] = "\ 11111111111111111111111111111111\ 11111111111111111111111111111111"; +// High bit set characters are always encoded as unicode. Worse case is 3 +// bytes per character in the output. That makes this conservative. static char ascii_friendly_chars[256] = "\ 66666666222622666666666666666666\ 11211111111111121111111111111111\ 11111111111111111111111111112111\ 11111111111111111111111111111116\ -66666666666666666666666666666666\ -66666666666666666666666666666666\ -66666666666666666666666666666666\ -66666666666666666666666666666666"; +33333333333333333333333333333333\ +33333333333333333333333333333333\ +33333333333333333333333333333333\ +33333333333333333333333333333333"; inline static size_t -json_friendly_size(const u_char *str, size_t len) { +hibit_friendly_size(const u_char *str, size_t len) { size_t size = 0; for (; 0 < len; str++, len--) { - size += json_friendly_chars[*str]; + size += hibit_friendly_chars[*str]; } return size - len * (size_t)'0'; } @@ -215,6 +218,58 @@ dump_hex(u_char c, Out out) { *out->cur++ = hex_chars[d]; } +const char* +dump_unicode(const char *str, const char *end, Out out) { + uint32_t code = 0; + uint8_t b = *(uint8_t*)str; + int i, cnt; + + if (0xC0 == (0xE0 & b)) { + cnt = 1; + code = b & 0x0000001F; + } else if (0xE0 == (0xF0 & b)) { + cnt = 2; + code = b & 0x0000000F; + } else if (0xF0 == (0xF8 & b)) { + cnt = 3; + code = b & 0x00000007; + } else if (0xF8 == (0xFC & b)) { + cnt = 4; + code = b & 0x00000003; + } else if (0xFC == (0xFE & b)) { + cnt = 5; + code = b & 0x00000001; + } else { + rb_raise(rb_eEncodingError, "Invalid Unicode\n"); + } + str++; + for (; 0 < cnt; cnt--, str++) { + b = *(uint8_t*)str; + if (end <= str || 0x80 != (0xC0 & b)) { + rb_raise(rb_eEncodingError, "Invalid Unicode\n"); + } + code = (code << 6) | (b & 0x0000003F); + } + if (0x0000FFFF < code) { + uint32_t c1; + + code -= 0x00010000; + c1 = ((code >> 10) & 0x000003FF) + 0x0000D800; + code = (code & 0x000003FF) + 0x0000DC00; + *out->cur++ = '\\'; + *out->cur++ = 'u'; + for (i = 3; 0 <= i; i--) { + *out->cur++ = hex_chars[(uint8_t)(c1 >> (i * 4)) & 0x0F]; + } + } + *out->cur++ = '\\'; + *out->cur++ = 'u'; + for (i = 3; 0 <= i; i--) { + *out->cur++ = hex_chars[(uint8_t)(code >> (i * 4)) & 0x0F]; + } + return str - 1; +} + // returns 0 if not using circular references, -1 if not further writing is // needed (duplicate), and a positive value if the object was added to the cache. static long @@ -383,8 +438,8 @@ dump_cstr(const char *str, size_t cnt, int is_sym, int escape1, Out out) { cmap = ascii_friendly_chars; size = ascii_friendly_size((u_char*)str, cnt); } else { - cmap = json_friendly_chars; - size = json_friendly_size((u_char*)str, cnt); + cmap = hibit_friendly_chars; + size = hibit_friendly_size((u_char*)str, cnt); } if (out->end - out->cur <= (long)size + 10) { // extra 10 for escaped first char, quotes, and sym grow(out, size + 10); @@ -410,10 +465,12 @@ dump_cstr(const char *str, size_t cnt, int is_sym, int escape1, Out out) { } *out->cur++ = '"'; } else { + const char *end = str + cnt; + if (is_sym) { *out->cur++ = ':'; } - for (; 0 < cnt; cnt--, str++) { + for (; str < end; str++) { switch (cmap[(u_char)*str]) { case '1': *out->cur++ = *str; @@ -429,18 +486,15 @@ dump_cstr(const char *str, size_t cnt, int is_sym, int escape1, Out out) { default: *out->cur++ = *str; break; } break; - case '6': + case '3': // Unicode + str = dump_unicode(str, end, out); + break; + case '6': // control characters *out->cur++ = '\\'; *out->cur++ = 'u'; - if ((u_char)*str <= 0x7F) { - *out->cur++ = '0'; - *out->cur++ = '0'; - dump_hex((u_char)*str, out); - } else { // continuation? - *out->cur++ = '0'; - *out->cur++ = '0'; - dump_hex((u_char)*str, out); - } + *out->cur++ = '0'; + *out->cur++ = '0'; + dump_hex((u_char)*str, out); break; default: break; // ignore, should never happen if the table is correct @@ -847,16 +901,17 @@ dump_hash(VALUE obj, int depth, int mode, Out out) { static void dump_time(VALUE obj, Out out) { - char buf[64]; - char *b = buf + sizeof(buf) - 1; - time_t sec = NUM2LONG(rb_funcall2(obj, oj_tv_sec_id, 0, 0)); - long usec = NUM2LONG(rb_funcall2(obj, oj_tv_usec_id, 0, 0)); - char *dot = b - 7; - long size; + char buf[64]; + char *b = buf + sizeof(buf) - 1; + char *dot = b - 10; + long size; + struct timespec ts = rb_time_timespec(obj); + time_t sec = ts.tv_sec; + long nsec = ts.tv_nsec; *b-- = '\0'; - for (; dot < b; b--, usec /= 10) { - *b = '0' + (usec % 10); + for (; dot < b; b--, nsec /= 10) { + *b = '0' + (nsec % 10); } *b-- = '.'; for (; 0 < sec; b--, sec /= 10) { diff --git a/ext/oj/fast.c b/ext/oj/fast.c index cd21ba5f..f8616f04 100644 --- a/ext/oj/fast.c +++ b/ext/oj/fast.c @@ -774,11 +774,6 @@ doc_init(Doc doc) { *doc->where = 0; doc->data = 0; doc->self = Qundef; -#ifdef HAVE_RUBY_ENCODING_H - doc->encoding = oj_default_options.encoding; -#else - doc->encoding = 0; -#endif doc->size = 0; doc->json = 0; doc->batches = &doc->batch0; diff --git a/ext/oj/load.c b/ext/oj/load.c index 3185b86d..0f291491 100644 --- a/ext/oj/load.c +++ b/ext/oj/load.c @@ -50,11 +50,6 @@ typedef struct _ParseInfo { char *str; /* buffer being read from */ char *s; /* current position in buffer */ CircArray circ_array; -#ifdef HAVE_RUBY_ENCODING_H - rb_encoding *encoding; -#else - void *encoding; -#endif Options options; } *ParseInfo; @@ -602,20 +597,14 @@ read_str(ParseInfo pi, int hint) { case T_STRING: obj = rb_str_new2(text); #ifdef HAVE_RUBY_ENCODING_H - if (0 != pi->encoding) { - rb_enc_associate(obj, pi->encoding); - } + rb_enc_associate(obj, oj_utf8_encoding); #endif break; case T_SYMBOL: #ifdef HAVE_RUBY_ENCODING_H - if (0 != pi->encoding) { - obj = rb_str_new2(text); - rb_enc_associate(obj, pi->encoding); - obj = rb_funcall(obj, oj_to_sym_id, 0); - } else { - obj = ID2SYM(rb_intern(text)); - } + obj = rb_str_new2(text); + rb_enc_associate(obj, oj_utf8_encoding); + obj = rb_funcall(obj, oj_to_sym_id, 0); #else obj = ID2SYM(rb_intern(text)); #endif @@ -625,13 +614,9 @@ read_str(ParseInfo pi, int hint) { obj = Qundef; if (':' == *text && !escaped) { // Symbol #ifdef HAVE_RUBY_ENCODING_H - if (0 != pi->encoding) { - obj = rb_str_new2(text + 1); - rb_enc_associate(obj, pi->encoding); - obj = rb_funcall(obj, oj_to_sym_id, 0); - } else { - obj = ID2SYM(rb_intern(text + 1)); - } + obj = rb_str_new2(text + 1); + rb_enc_associate(obj, oj_utf8_encoding); + obj = rb_funcall(obj, oj_to_sym_id, 0); #else obj = ID2SYM(rb_intern(text + 1)); #endif @@ -647,9 +632,7 @@ read_str(ParseInfo pi, int hint) { if (Qundef == obj) { obj = rb_str_new2(text); #ifdef HAVE_RUBY_ENCODING_H - if (0 != pi->encoding) { - rb_enc_associate(obj, pi->encoding); - } + rb_enc_associate(obj, oj_utf8_encoding); #endif } break; @@ -749,23 +732,24 @@ read_num(ParseInfo pi) { static VALUE read_time(ParseInfo pi) { - VALUE args[2]; - long v = 0; - long v2 = 0; + time_t v = 0; + long v2 = 0; for (; '0' <= *pi->s && *pi->s <= '9'; pi->s++) { v = v * 10 + (*pi->s - '0'); } if ('.' == *pi->s) { + int cnt; + pi->s++; - for (; '0' <= *pi->s && *pi->s <= '9'; pi->s++) { + for (cnt = 9; 0 < cnt && '0' <= *pi->s && *pi->s <= '9'; pi->s++, cnt--) { v2 = v2 * 10 + (*pi->s - '0'); } + for (; 0 < cnt; cnt--) { + v2 *= 10; + } } - args[0] = LONG2NUM(v); - args[1] = LONG2NUM(v2); - - return rb_funcall2(oj_time_class, oj_at_id, 2, args); + return rb_time_nano_new(v, v2); } static VALUE @@ -801,33 +785,61 @@ read_nil(ParseInfo pi) { return Qnil; } -static char +static uint32_t read_hex(ParseInfo pi, char *h) { - uint8_t b = 0; - - if ('0' <= *h && *h <= '9') { - b = *h - '0'; - } else if ('A' <= *h && *h <= 'F') { - b = *h - 'A' + 10; - } else if ('a' <= *h && *h <= 'f') { - b = *h - 'a' + 10; - } else { - pi->s = h; - raise_error("invalid hex character", pi->str, pi->s); - } - h++; - b = b << 4; - if ('0' <= *h && *h <= '9') { - b += *h - '0'; - } else if ('A' <= *h && *h <= 'F') { - b += *h - 'A' + 10; - } else if ('a' <= *h && *h <= 'f') { - b += *h - 'a' + 10; + uint32_t b = 0; + int i; + + // TBD this can be made faster with a table + for (i = 0; i < 4; i++, h++) { + b = b << 4; + if ('0' <= *h && *h <= '9') { + b += *h - '0'; + } else if ('A' <= *h && *h <= 'F') { + b += *h - 'A' + 10; + } else if ('a' <= *h && *h <= 'f') { + b += *h - 'a' + 10; + } else { + pi->s = h; + raise_error("invalid hex character", pi->str, pi->s); + } + } + return b; +} + +static char* +unicode_to_chars(ParseInfo pi, char *t, uint32_t code) { + if (0x0000007F >= code) { + *t = (char)code; + } else if (0x000007FF >= code) { + *t++ = 0xC0 | (code >> 6); + *t = 0x80 | (0x3F & code); + } else if (0x0000FFFF >= code) { + *t++ = 0xE0 | (code >> 12); + *t++ = 0x80 | ((code >> 6) & 0x3F); + *t = 0x80 | (0x3F & code); + } else if (0x001FFFFF >= code) { + *t++ = 0xF0 | (code >> 18); + *t++ = 0x80 | ((code >> 12) & 0x3F); + *t++ = 0x80 | ((code >> 6) & 0x3F); + *t = 0x80 | (0x3F & code); + } else if (0x03FFFFFF >= code) { + *t++ = 0xF8 | (code >> 24); + *t++ = 0x80 | ((code >> 18) & 0x3F); + *t++ = 0x80 | ((code >> 12) & 0x3F); + *t++ = 0x80 | ((code >> 6) & 0x3F); + *t = 0x80 | (0x3F & code); + } else if (0x7FFFFFFF >= code) { + *t++ = 0xFC | (code >> 30); + *t++ = 0x80 | ((code >> 24) & 0x3F); + *t++ = 0x80 | ((code >> 18) & 0x3F); + *t++ = 0x80 | ((code >> 12) & 0x3F); + *t++ = 0x80 | ((code >> 6) & 0x3F); + *t = 0x80 | (0x3F & code); } else { - pi->s = h; - raise_error("invalid hex character", pi->str, pi->s); + raise_error("invalid Unicode", pi->str, pi->s); } - return (char)b; + return t; } /* Assume the value starts immediately and goes until the quote character is @@ -838,6 +850,7 @@ read_quoted_value(ParseInfo pi) { char *value = 0; char *h = pi->s; // head char *t = h; // tail + uint32_t code; h++; // skip quote character t++; @@ -859,13 +872,24 @@ read_quoted_value(ParseInfo pi) { case '\\': *t = '\\'; break; case 'u': h++; - *t = read_hex(pi, h); - h += 2; - if ('\0' != *t) { - t++; + code = read_hex(pi, h); + h += 3; + if (0x0000D800 <= code && code <= 0x0000DFFF) { + uint32_t c1 = (code - 0x0000D800) & 0x000003FF; + uint32_t c2; + + h++; + if ('\\' != *h || 'u' != *(h + 1)) { + pi->s = h; + raise_error("invalid escaped character", pi->str, pi->s); + } + h += 2; + c2 = read_hex(pi, h); + h += 3; + c2 = (c2 - 0x0000DC00) & 0x000003FF; + code = ((c1 << 10) | c2) + 0x00010000; } - *t = read_hex(pi, h); - h++; + t = unicode_to_chars(pi, t, code); break; default: pi->s = h; @@ -897,12 +921,6 @@ oj_parse(char *json, Options options) { if (Yes == options->circular) { pi.circ_array = circ_array_new(); } -#ifdef HAVE_RUBY_ENCODING_H - pi.encoding = options->encoding; - //pi.encoding = ('\0' == *options->encoding) ? 0 : rb_enc_find(options->encoding); -#else - pi.encoding = 0; -#endif pi.options = options; obj = read_next(&pi, 0); if (Yes == options->circular) { diff --git a/ext/oj/oj.c b/ext/oj/oj.c index b8575209..d5b16eda 100644 --- a/ext/oj/oj.c +++ b/ext/oj/oj.c @@ -52,7 +52,6 @@ void Init_oj(); VALUE Oj = Qnil; ID oj_as_json_id; -ID oj_at_id; ID oj_fileno_id; ID oj_instance_variables_id; ID oj_json_create_id; @@ -61,9 +60,6 @@ ID oj_string_id; ID oj_to_hash_id; ID oj_to_json_id; ID oj_to_sym_id; -ID oj_tv_nsec_id; -ID oj_tv_sec_id; -ID oj_tv_usec_id; ID oj_write_id; VALUE oj_bag_class; @@ -78,7 +74,6 @@ static VALUE ascii_only_sym; static VALUE auto_define_sym; static VALUE circular_sym; static VALUE compat_sym; -static VALUE encoding_sym; static VALUE indent_sym; static VALUE mode_sym; static VALUE null_sym; @@ -99,8 +94,11 @@ static VALUE keep = Qnil; Cache oj_class_cache = 0; Cache oj_attr_cache = 0; +#ifdef HAVE_RUBY_ENCODING_H +rb_encoding *oj_utf8_encoding = 0; +#endif + struct _Options oj_default_options = { - 0, // encoding 0, // indent No, // circular Yes, // auto_define @@ -116,7 +114,6 @@ static VALUE define_mimic_json(VALUE self); * * Returns the default load and dump options as a Hash. The options are * - indent: [Fixnum] number of spaces to indent each element in an JSON document - * - encoding: [String|Encoding] character encoding for the JSON coument * - circular: [true|false|nil] support circular references while dumping * - auto_define: [true|false|nil] automatically define classes if they do not exist * - symbol_keys: [true|false|nil] use symbols instead of strings for hash keys @@ -127,9 +124,6 @@ static VALUE get_def_opts(VALUE self) { VALUE opts = rb_hash_new(); -#ifdef HAVE_RUBY_ENCODING_H - rb_hash_aset(opts, encoding_sym, (0 == oj_default_options.encoding) ? Qnil : rb_enc_from_encoding(oj_default_options.encoding)); -#endif rb_hash_aset(opts, indent_sym, INT2FIX(oj_default_options.indent)); rb_hash_aset(opts, circular_sym, (Yes == oj_default_options.circular) ? Qtrue : ((No == oj_default_options.circular) ? Qfalse : Qnil)); rb_hash_aset(opts, auto_define_sym, (Yes == oj_default_options.auto_define) ? Qtrue : ((No == oj_default_options.auto_define) ? Qfalse : Qnil)); @@ -150,7 +144,6 @@ get_def_opts(VALUE self) { * Sets the default options for load and dump. * @param [Hash] opts options to change * @param [Fixnum] :indent number of spaces to indent each element in an JSON document - * @param [String] :encoding character encoding for the JSON file * @param [true|false|nil] :circular support circular references while dumping * @param [true|false|nil] :auto_define automatically define classes if they do not exist * @param [true|false|nil] :symbol_keys convert hash keys to symbols @@ -177,21 +170,6 @@ set_def_opts(VALUE self, VALUE opts) { VALUE v; Check_Type(opts, T_HASH); - -#ifdef HAVE_RUBY_ENCODING_H - if (Qtrue == rb_funcall(opts, rb_intern("has_key?"), 1, encoding_sym)) { - v = rb_hash_lookup(opts, encoding_sym); - if (Qnil == v) { - oj_default_options.encoding = 0; - } else if (T_STRING == rb_type(v)) { - oj_default_options.encoding = rb_enc_find(StringValuePtr(v)); - } else if (rb_cEncoding == rb_obj_class(v)) { - oj_default_options.encoding = rb_to_encoding(v); - } else { - rb_raise(rb_eArgError, ":encoding must be nil, a String, or an Encoding.\n"); - } - } -#endif v = rb_hash_aref(opts, indent_sym); if (Qnil != v) { Check_Type(v, T_FIXNUM); @@ -250,17 +228,6 @@ parse_options(VALUE ropts, Options copts) { } copts->indent = NUM2INT(v); } -#ifdef HAVE_RUBY_ENCODING_H - if (Qnil != (v = rb_hash_lookup(ropts, encoding_sym))) { - if (T_STRING == rb_type(v)) { - oj_default_options.encoding = rb_enc_find(StringValuePtr(v)); - } else if (rb_cEncoding == rb_obj_class(v)) { - oj_default_options.encoding = rb_to_encoding(v); - } else { - rb_raise(rb_eArgError, ":encoding must be nil, a String, or an Encoding.\n"); - } - } -#endif if (Qnil != (v = rb_hash_lookup(ropts, mode_sym))) { if (object_sym == v) { copts->mode = ObjectMode; @@ -430,9 +397,7 @@ dump(int argc, VALUE *argv, VALUE self) { } rstr = rb_str_new2(json); #ifdef HAVE_RUBY_ENCODING_H - if (0 != copts.encoding) { - rb_enc_associate(rstr, copts.encoding); - } + rb_enc_associate(rstr, oj_utf8_encoding); #endif xfree(json); @@ -475,9 +440,7 @@ mimic_dump(int argc, VALUE *argv, VALUE self) { } rstr = rb_str_new2(json); #ifdef ENCODING_INLINE_MAX - if (0 != copts.encoding) { - rb_enc_associate(rstr, copts.encoding); - } + rb_enc_associate(rstr, oj_utf8_encoding); #endif if (2 <= argc && Qnil != argv[1]) { VALUE io = argv[1]; @@ -615,9 +578,7 @@ mimic_generate_core(int argc, VALUE *argv, Options copts) { } rstr = rb_str_new2(json); #ifdef ENCODING_INLINE_MAX - if (0 != copts->encoding) { - rb_enc_associate(rstr, copts->encoding); - } + rb_enc_associate(rstr, oj_utf8_encoding); #endif xfree(json); @@ -755,7 +716,6 @@ void Init_oj() { rb_define_module_function(Oj, "to_file", to_file, -1); oj_as_json_id = rb_intern("as_json"); - oj_at_id = rb_intern("at"); oj_fileno_id = rb_intern("fileno"); oj_instance_variables_id = rb_intern("instance_variables"); oj_json_create_id = rb_intern("json_create"); @@ -764,9 +724,6 @@ void Init_oj() { oj_to_hash_id = rb_intern("to_hash"); oj_to_json_id = rb_intern("to_json"); oj_to_sym_id = rb_intern("to_sym"); - oj_tv_nsec_id = rb_intern("tv_nsec"); - oj_tv_sec_id = rb_intern("tv_sec"); - oj_tv_usec_id = rb_intern("tv_usec"); oj_write_id = rb_intern("write"); oj_bag_class = rb_const_get_at(Oj, rb_intern("Bag")); @@ -779,7 +736,6 @@ void Init_oj() { auto_define_sym = ID2SYM(rb_intern("auto_define")); rb_ary_push(keep, auto_define_sym); circular_sym = ID2SYM(rb_intern("circular")); rb_ary_push(keep, circular_sym); compat_sym = ID2SYM(rb_intern("compat")); rb_ary_push(keep, compat_sym); - encoding_sym = ID2SYM(rb_intern("encoding")); rb_ary_push(keep, encoding_sym); indent_sym = ID2SYM(rb_intern("indent")); rb_ary_push(keep, indent_sym); mode_sym = ID2SYM(rb_intern("mode")); rb_ary_push(keep, mode_sym); symbol_keys_sym = ID2SYM(rb_intern("symbol_keys")); rb_ary_push(keep, symbol_keys_sym); @@ -791,7 +747,7 @@ void Init_oj() { oj_default_options.mode = ObjectMode; #ifdef HAVE_RUBY_ENCODING_H - oj_default_options.encoding = rb_enc_find("UTF-8"); + oj_utf8_encoding = rb_enc_find("UTF-8"); #endif oj_cache_new(&oj_class_cache); diff --git a/ext/oj/oj.h b/ext/oj/oj.h index b35df7bd..c13714a2 100644 --- a/ext/oj/oj.h +++ b/ext/oj/oj.h @@ -99,12 +99,6 @@ typedef struct _DumpOpts { } *DumpOpts; typedef struct _Options { -#ifdef HAVE_RUBY_ENCODING_H - rb_encoding *encoding; -#else - void *encoding; -#endif - //char encoding[64]; // encoding, stored in the option to avoid GC invalidation in default values int indent; // indention for dump, default 2 char circular; // YesNo char auto_define; // YesNo @@ -148,6 +142,9 @@ extern void oj_init_doc(void); extern VALUE Oj; extern struct _Options oj_default_options; +#ifdef HAVE_RUBY_ENCODING_H +extern rb_encoding *oj_utf8_encoding; +#endif extern VALUE oj_bag_class; extern VALUE oj_date_class; @@ -159,16 +156,12 @@ extern VALUE oj_time_class; extern VALUE oj_slash_string; extern ID oj_as_json_id; -extern ID oj_at_id; extern ID oj_instance_variables_id; extern ID oj_json_create_id; extern ID oj_string_id; extern ID oj_to_hash_id; extern ID oj_to_json_id; extern ID oj_to_sym_id; -extern ID oj_tv_nsec_id; -extern ID oj_tv_sec_id; -extern ID oj_tv_usec_id; extern Cache oj_class_cache; extern Cache oj_attr_cache;