diff --git a/c_src/encoder.c b/c_src/encoder.c index f063f77c..1b4baaf0 100644 --- a/c_src/encoder.c +++ b/c_src/encoder.c @@ -81,6 +81,8 @@ enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin) e->uescape = 1; } else if(enif_compare(val, e->atoms->atom_pretty) == 0) { e->pretty = 1; + } else if(enif_compare(val, e->atoms->atom_force_utf8) == 0) { + // Ignore, handled in Erlang } else { return 0; } diff --git a/c_src/jiffy.c b/c_src/jiffy.c index 8fdde2f4..3f64fe56 100644 --- a/c_src/jiffy.c +++ b/c_src/jiffy.c @@ -22,6 +22,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info) st->atom_partial = make_atom(env, "partial"); st->atom_uescape = make_atom(env, "uescape"); st->atom_pretty = make_atom(env, "pretty"); + st->atom_force_utf8 = make_atom(env, "force_utf8"); // Markers used in encoding st->ref_object = make_atom(env, "$object_ref$"); diff --git a/c_src/jiffy.h b/c_src/jiffy.h index 327657aa..c477a439 100644 --- a/c_src/jiffy.h +++ b/c_src/jiffy.h @@ -18,6 +18,7 @@ typedef struct { ERL_NIF_TERM atom_partial; ERL_NIF_TERM atom_uescape; ERL_NIF_TERM atom_pretty; + ERL_NIF_TERM atom_force_utf8; ERL_NIF_TERM ref_object; ERL_NIF_TERM ref_array; diff --git a/src/jiffy.erl b/src/jiffy.erl index b31a5263..c4b3d696 100644 --- a/src/jiffy.erl +++ b/src/jiffy.erl @@ -25,7 +25,11 @@ encode(Data) -> encode(Data, Options) -> + ForceUTF8 = lists:member(force_utf8, Options), case nif_encode(Data, Options) of + {error, invalid_string} when ForceUTF8 == true -> + FixedData = jiffy_utf8:fix(Data), + encode(FixedData, Options -- [force_utf8]); {error, _} = Error -> throw(Error); {partial, IOData} -> diff --git a/src/jiffy_utf8.erl b/src/jiffy_utf8.erl new file mode 100644 index 00000000..ee937fe4 --- /dev/null +++ b/src/jiffy_utf8.erl @@ -0,0 +1,104 @@ +% This file is part of Jiffy released under the MIT license. +% See the LICENSE file for more information. + +-module(jiffy_utf8). +-export([fix/1]). 
+
+
+fix({Props}) ->                      % {Props} wrapper: repair each key and value
+    fix_props(Props, []);
+fix(Values) when is_list(Values) ->  % list: repair each element
+    fix_array(Values, []);
+fix(Bin) when is_binary(Bin) ->      % binary: decode loosely and re-encode as UTF-8
+    fix_bin(Bin);
+fix(Val) ->                          % anything else is returned unchanged
+    Val.
+
+
+fix_props([], Acc) ->
+    {lists:reverse(Acc)};            % restore input order and the 1-tuple wrapper
+fix_props([{K0, V0} | Rest], Acc) ->
+    K = fix(K0),
+    V = fix(V0),
+    fix_props(Rest, [{K, V} | Acc]).
+
+
+fix_array([], Acc) ->
+    lists:reverse(Acc);
+fix_array([Val | Rest], Acc0) ->
+    Acc = [fix(Val) | Acc0],
+    fix_array(Rest, Acc).
+
+
+fix_bin(Bin) ->
+    Dec0 = loose_decode(Bin, 0, []),    % bytes -> code points, U+FFFD for broken bytes
+    Dec1 = try_combining(Dec0, []),     % merge adjacent surrogate values where possible
+    Dec2 = replace_garbage(Dec1, []),   % U+FFFD for values xmerl_ucs:is_unicode/1 rejects
+    list_to_binary(xmerl_ucs:to_utf8(Dec2)).
+
+
+loose_decode(Bin, O, Acc) ->             % O: byte offset into Bin; Acc: code points, reversed
+    case Bin of
+        <<_:O/binary>> ->                % offset reached the end of the input
+            lists:reverse(Acc);
+        <<_:O/binary, 0:1/integer, V:7/integer, _/binary>> ->    % 0xxxxxxx
+            loose_decode(Bin, O+1, [V | Acc]);
+        <<_:O/binary, 6:3/integer, V0:5/integer,                 % 110xxxxx 10xxxxxx
+                2:2/integer, V1:6/integer, _/binary>> ->
+            B = <<0:5/integer, V0:5/integer, V1:6/integer>>,     % pad 11 payload bits to 16
+            <<V:16/integer>> = B,
+            loose_decode(Bin, O+2, [V | Acc]);
+        <<_:O/binary, 14:4/integer, V0:4/integer,                % 1110xxxx + 2 continuations
+                2:2/integer, V1:6/integer,
+                2:2/integer, V2:6/integer, _/binary>> ->
+            B = <<V0:4/integer, V1:6/integer, V2:6/integer>>,    % exactly 16 payload bits
+            <<V:16/integer>> = B,
+            loose_decode(Bin, O+3, [V | Acc]);
+        <<_:O/binary, 30:5/integer, V0:3/integer,                % 11110xxx + 3 continuations
+                2:2/integer, V1:6/integer,
+                2:2/integer, V2:6/integer,
+                2:2/integer, V3:6/integer, _/binary>> ->
+            B = <<0:11/integer, V0:3/integer, V1:6/integer,      % pad 21 payload bits to 32
+                    V2:6/integer, V3:6/integer>>,
+            <<V:32/integer>> = B,
+            loose_decode(Bin, O+4, [V | Acc]);
+        <<_:O/binary, _:8/integer, R/binary>> ->
+            % Broken lead or continuation byte. Discard first
+            % byte and all broken continuations. Replace the
+            % whole mess with a replacement code point.
+            T = 1 + count_continuation_bytes(R, 0),
+            loose_decode(Bin, O+T, [16#FFFD | Acc])
+    end.
+
+
+count_continuation_bytes(R, O) ->        % number of leading 10xxxxxx bytes in R from offset O
+    case R of
+        <<_:O/binary, 2:2/integer, _:6/integer, _/binary>> ->
+            count_continuation_bytes(R, O+1);
+        _ ->
+            O
+    end.
+
+
+try_combining([], Acc) ->
+    lists:reverse(Acc);
+try_combining([H, L | Rest], Acc) when H >= 16#D800, H =< 16#DFFF,   % both in surrogate range
+                                        L >= 16#D800, L =< 16#DFFF ->
+    Bin = <<H:16/big-unsigned-integer, L:16/big-unsigned-integer>>,  % as one UTF-16BE pair
+    try
+        [C] = xmerl_ucs:from_utf16be(Bin),
+        try_combining(Rest, [C | Acc])
+    catch _:_ ->                         % not decodable as one code point: keep both values
+        try_combining(Rest, [L, H | Acc])
+    end;
+try_combining([C | Rest], Acc) ->
+    try_combining(Rest, [C | Acc]).
+
+
+replace_garbage([], Acc) ->
+    lists:reverse(Acc);
+replace_garbage([C | Rest], Acc) ->
+    case xmerl_ucs:is_unicode(C) of
+        true -> replace_garbage(Rest, [C | Acc]);
+        false -> replace_garbage(Rest, [16#FFFD | Acc])   % substitute U+FFFD
+    end.
diff --git a/test/004-strings.t b/test/004-strings.t index 99852a3e..17fb2b59 100755 --- a/test/004-strings.t +++ b/test/004-strings.t @@ -6,7 +6,7 @@ main([]) -> code:add_pathz("ebin"), code:add_pathz("test"), - etap:plan(87), + etap:plan(116), util:test_good(good()), util:test_good(uescaped(), [uescape]), util:test_errors(errors()), @@ -61,12 +61,17 @@ errors() -> test_utf8([]) -> ok; -test_utf8([Case | Rest]) -> +test_utf8([{Case, Fixed} | Rest]) -> etap:fun_is( fun({error, invalid_string}) -> true; (Else) -> Else end, (catch jiffy:encode(Case)), lists:flatten(io_lib:format("Invalid utf-8: ~p", [Case])) ), + etap:fun_is( + fun(Fixed) -> true; (Else) -> Else end, + jiffy:encode(Case, [force_utf8]), + lists:flatten(io_lib:format("Fixed correctly: ~p", [Fixed])) + ), Case2 = <<34, Case/binary, 34>>, etap:fun_is( fun({error, {_, invalid_string}}) -> true; (Else) -> Else end, @@ -78,47 +83,47 @@ test_utf8([Case | Rest]) -> utf8_cases() -> [ % Stray continuation byte - <<16#C2, 16#81, 16#80>>, - <<"foo", 16#80, "bar">>, + {<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>}, + {<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>}, % Invalid Unicode code points - <<239, 191, 190>>, - <<237, 160, 129>>, + {<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>}, + {<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>}, % Not enough extension bytes - <<16#C0>>, + {<<16#C0>>, <<16#EF,
16#BF, 16#BD>>}, - <<16#E0>>, - <<16#E0, 16#80>>, + {<<16#E0>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#E0, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, - <<16#F0>>, - <<16#F0, 16#80>>, - <<16#F0, 16#80, 16#80>>, + {<<16#F0>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#F0, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#F0, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, - <<16#F8>>, - <<16#F8, 16#80>>, - <<16#F8, 16#80, 16#80>>, - <<16#F8, 16#80, 16#80, 16#80>>, + {<<16#F8>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#F8, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#F8, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#F8, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, - <<16#FC>>, - <<16#FC, 16#80>>, - <<16#FC, 16#80, 16#80>>, - <<16#FC, 16#80, 16#80, 16#80>>, - <<16#FC, 16#80, 16#80, 16#80, 16#80>>, + {<<16#FC>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#FC, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#FC, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#FC, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, + {<<16#FC, 16#80, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>}, % No data in high bits. - <<16#C0, 16#80>>, - <<16#C1, 16#80>>, + {<<16#C0, 16#80>>, <<"\"\\u0000\"">>}, + {<<16#C1, 16#80>>, <<"\"\\u0000\"">>}, - <<16#E0, 16#80, 16#80>>, - <<16#E0, 16#90, 16#80>>, + {<<16#E0, 16#80, 16#80>>, <<"\"\\u0000\"">>}, + {<<16#E0, 16#90, 16#80>>, <<"\"\\u0000\"">>}, - <<16#F0, 16#80, 16#80, 16#80>>, - <<16#F0, 16#88, 16#80, 16#80>>, + {<<16#F0, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}, + {<<16#F0, 16#88, 16#80, 16#80>>, <<"\"\\u0000\"">>}, - <<16#F8, 16#80, 16#80, 16#80, 16#80>>, - <<16#F8, 16#84, 16#80, 16#80, 16#80>>, + {<<16#F8, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}, + {<<16#F8, 16#84, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}, - <<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>, - <<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>> + {<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}, + {<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>} ].