Add an option to ignore UTF-8 encoding errors

By default Jiffy is quite strict in what it encodes: it will not allow invalid UTF-8 to be produced. This can cause issues when attempting to encode JSON that was decoded by other libraries, as UTF-8 semantics are not uniformly enforced. This patch adds an option 'force_utf8' to the encoder. If encoding hits an error for an invalid string, it will forcefully mutate the object to contain only valid UTF-8 and return the resulting encoded JSON. For the most part this means it will strip any garbage data from binaries, replacing it with the replacement codepoint U+FFFD. It will also try to detect the common error of encoding surrogate pairs as three-byte sequences and re-encode them into UTF-8 properly.
dch · Jun 1, 2012 · 414827d · 414827d
1 parent 6f589d4
commit 414827d
Show file tree

Hide file tree

Showing 6 changed files with 148 additions and 31 deletions.
diff --git a/c_src/encoder.c b/c_src/encoder.c
@@ -81,6 +81,8 @@ enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin)
             e->uescape = 1;
         } else if(enif_compare(val, e->atoms->atom_pretty) == 0) {
             e->pretty = 1;
+        } else if(enif_compare(val, e->atoms->atom_force_utf8) == 0) {
+            // Ignore, handled in Erlang
         } else {
             return 0;
         }

diff --git a/c_src/jiffy.c b/c_src/jiffy.c
@@ -22,6 +22,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info)
     st->atom_partial = make_atom(env, "partial");
     st->atom_uescape = make_atom(env, "uescape");
     st->atom_pretty = make_atom(env, "pretty");
+    st->atom_force_utf8 = make_atom(env, "force_utf8");
 
     // Markers used in encoding
     st->ref_object = make_atom(env, "$object_ref$");

diff --git a/c_src/jiffy.h b/c_src/jiffy.h
@@ -18,6 +18,7 @@ typedef struct {
     ERL_NIF_TERM    atom_partial;
     ERL_NIF_TERM    atom_uescape;
     ERL_NIF_TERM    atom_pretty;
+    ERL_NIF_TERM    atom_force_utf8;
 
     ERL_NIF_TERM    ref_object;
     ERL_NIF_TERM    ref_array;

diff --git a/src/jiffy.erl b/src/jiffy.erl
@@ -25,7 +25,11 @@ encode(Data) ->
 
 
 encode(Data, Options) ->
+    ForceUTF8 = lists:member(force_utf8, Options),
     case nif_encode(Data, Options) of
+        {error, invalid_string} when ForceUTF8 == true ->
+            FixedData = jiffy_utf8:fix(Data),
+            encode(FixedData, Options -- [force_utf8]);
         {error, _} = Error ->
             throw(Error);
         {partial, IOData} ->

diff --git a/src/jiffy_utf8.erl b/src/jiffy_utf8.erl
@@ -0,0 +1,104 @@
+% This file is part of Jiffy released under the MIT license.
+% See the LICENSE file for more information.
+
+-module(jiffy_utf8).
+-export([fix/1]).
+
+
+% Walk an EJSON term and repair every binary found in it.
+%
+%   {Members} - a 1-tuple wrapping a list of {Key, Value} pairs; both
+%               keys and values are repaired recursively.
+%   list      - every element is repaired recursively.
+%   binary    - rewritten by fix_bin/1.
+%   other     - returned untouched.
+fix({Members}) ->
+    fix_props(Members, []);
+fix(Items) when is_list(Items) ->
+    fix_array(Items, []);
+fix(Text) when is_binary(Text) ->
+    fix_bin(Text);
+fix(Other) ->
+    Other.
+
+
+% Repair each {Key, Value} pair of an object's member list and wrap the
+% result back into the 1-tuple object form. Acc holds already-processed
+% pairs in reverse order; they end up in front of the newly fixed ones.
+fix_props(Props, Acc) ->
+    FixPair = fun({Key, Value}) -> {fix(Key), fix(Value)} end,
+    {lists:reverse(Acc, lists:map(FixPair, Props))}.
+
+
+% Repair every element of a list. Acc holds already-processed elements
+% in reverse order; they end up in front of the newly fixed ones.
+fix_array(Values, Acc) ->
+    lists:reverse(Acc, lists:map(fun fix/1, Values)).
+
+
+% Rewrite a binary in four passes:
+%   1. loose_decode/3 turns the bytes into a list of integers, emitting
+%      U+FFFD for bytes it cannot parse.
+%   2. try_combining/2 merges surrogate pairs into one code point.
+%   3. replace_garbage/2 swaps anything xmerl_ucs:is_unicode/1 rejects
+%      for U+FFFD.
+%   4. xmerl_ucs:to_utf8/1 encodes the list again.
+fix_bin(Bin) ->
+    Loose = loose_decode(Bin, 0, []),
+    Combined = try_combining(Loose, []),
+    CodePoints = replace_garbage(Combined, []),
+    list_to_binary(xmerl_ucs:to_utf8(CodePoints)).
+
+
+% Decode Bin, starting at byte offset Off, into a list of integers.
+%
+% Only the UTF-8 framing bits are checked (lead-byte prefix plus the
+% 10xxxxxx prefix of each continuation byte); the payload bits are
+% extracted as-is, so overlong forms and surrogates come through as
+% plain integers for the later passes to deal with. Any byte that does
+% not start a well-framed 1-4 byte sequence is replaced, together with
+% the continuation bytes directly after it, by a single U+FFFD.
+% Acc collects the result in reverse order.
+loose_decode(Bin, Off, Acc) ->
+    case Bin of
+        <<_:Off/binary>> ->
+            % Consumed everything.
+            lists:reverse(Acc);
+        <<_:Off/binary, 0:1, CP:7, _/binary>> ->
+            % 0xxxxxxx
+            loose_decode(Bin, Off + 1, [CP | Acc]);
+        <<_:Off/binary, 2#110:3, A:5,
+                2#10:2, B:6, _/binary>> ->
+            % 110xxxxx 10xxxxxx
+            CP = (A bsl 6) bor B,
+            loose_decode(Bin, Off + 2, [CP | Acc]);
+        <<_:Off/binary, 2#1110:4, A:4,
+                2#10:2, B:6,
+                2#10:2, C:6, _/binary>> ->
+            % 1110xxxx 10xxxxxx 10xxxxxx
+            CP = (A bsl 12) bor (B bsl 6) bor C,
+            loose_decode(Bin, Off + 3, [CP | Acc]);
+        <<_:Off/binary, 2#11110:5, A:3,
+                2#10:2, B:6,
+                2#10:2, C:6,
+                2#10:2, D:6, _/binary>> ->
+            % 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            CP = (A bsl 18) bor (B bsl 12) bor (C bsl 6) bor D,
+            loose_decode(Bin, Off + 4, [CP | Acc]);
+        <<_:Off/binary, _:8, Tail/binary>> ->
+            % Broken lead byte or stray continuation byte: drop it and
+            % every continuation byte that follows, and emit a single
+            % replacement code point for the whole run.
+            Skip = 1 + count_continuation_bytes(Tail, 0),
+            loose_decode(Bin, Off + Skip, [16#FFFD | Acc])
+    end.
+
+
+% Starting at byte offset Pos of Bytes, skip over consecutive bytes of
+% the form 10xxxxxx and return the offset of the first byte that is not
+% one (or of the end of the binary). Called with Pos = 0 this is the
+% number of leading continuation bytes.
+count_continuation_bytes(Bytes, Pos) ->
+    case Bytes of
+        <<_:Pos/binary, 2#10:2, _:6, _/binary>> ->
+            count_continuation_bytes(Bytes, Pos + 1);
+        _ ->
+            Pos
+    end.
+
+
+% Merge UTF-16 surrogate pairs found in a list of integers into the
+% supplementary-plane code point they stand for.
+%
+% A pair is a high surrogate (16#D800..16#DBFF) immediately followed by
+% a low surrogate (16#DC00..16#DFFF). Everything else, including an
+% unpaired surrogate, is copied through unchanged so that a later pass
+% can replace it. The guards test the exact high/low ranges, so a stray
+% surrogate is consumed on its own and never hides a valid pair that
+% starts right after it. Pairs are combined arithmetically, which keeps
+% every recursive call in tail position.
+% Acc collects the result in reverse order.
+try_combining([], Acc) ->
+    lists:reverse(Acc);
+try_combining([Hi, Lo | Rest], Acc) when is_integer(Hi), is_integer(Lo),
+                                          Hi >= 16#D800, Hi =< 16#DBFF,
+                                          Lo >= 16#DC00, Lo =< 16#DFFF ->
+    % Each surrogate carries 10 payload bits; the pair encodes the code
+    % point minus 16#10000.
+    CP = 16#10000 + ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF),
+    try_combining(Rest, [CP | Acc]);
+try_combining([C | Rest], Acc) ->
+    try_combining(Rest, [C | Acc]).
+
+
+% Replace every element that xmerl_ucs:is_unicode/1 rejects with the
+% replacement code point U+FFFD; accepted elements are kept as they
+% are. Acc holds already-processed elements in reverse order; they end
+% up in front of the newly processed ones.
+replace_garbage(CodePoints, Acc) ->
+    Scrub = fun(C) ->
+        case xmerl_ucs:is_unicode(C) of
+            true -> C;
+            false -> 16#FFFD
+        end
+    end,
+    lists:reverse(Acc, lists:map(Scrub, CodePoints)).
diff --git a/test/004-strings.t b/test/004-strings.t
@@ -6,7 +6,7 @@ main([]) ->
     code:add_pathz("ebin"),
     code:add_pathz("test"),
 
-    etap:plan(87),
+    etap:plan(116),
     util:test_good(good()),
     util:test_good(uescaped(), [uescape]),
     util:test_errors(errors()),
@@ -61,12 +61,17 @@ errors() ->
 
 test_utf8([]) ->
     ok;
-test_utf8([Case | Rest]) ->
+test_utf8([{Case, Fixed} | Rest]) ->
     etap:fun_is(
         fun({error, invalid_string}) -> true; (Else) -> Else end,
         (catch jiffy:encode(Case)),
         lists:flatten(io_lib:format("Invalid utf-8: ~p", [Case]))
     ),
+    etap:fun_is(
+        fun(Fixed) -> true; (Else) -> Else end,
+        jiffy:encode(Case, [force_utf8]),
+        lists:flatten(io_lib:format("Fixed correctly: ~p", [Fixed]))
+    ),
     Case2 = <<34, Case/binary, 34>>,
     etap:fun_is(
         fun({error, {_, invalid_string}}) -> true; (Else) -> Else end,
@@ -78,47 +83,47 @@ test_utf8([Case | Rest]) ->
 utf8_cases() ->
     [
         % Stray continuation byte
-        <<16#C2, 16#81, 16#80>>,
-        <<"foo", 16#80, "bar">>,
+        {<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>},
+        {<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>},
 
         % Invalid Unicode code points
-        <<239, 191, 190>>,
-        <<237, 160, 129>>,
+        {<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>},
 
         % Not enough extension bytes
-        <<16#C0>>,
+        {<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},
 
-        <<16#E0>>,
-        <<16#E0, 16#80>>,
+        {<<16#E0>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#E0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
 
-        <<16#F0>>,
-        <<16#F0, 16#80>>,
-        <<16#F0, 16#80, 16#80>>,
+        {<<16#F0>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#F0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#F0, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
 
-        <<16#F8>>,
-        <<16#F8, 16#80>>,
-        <<16#F8, 16#80, 16#80>>,
-        <<16#F8, 16#80, 16#80, 16#80>>,
+        {<<16#F8>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#F8, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#F8, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#F8, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
 
-        <<16#FC>>,
-        <<16#FC, 16#80>>,
-        <<16#FC, 16#80, 16#80>>,
-        <<16#FC, 16#80, 16#80, 16#80>>,
-        <<16#FC, 16#80, 16#80, 16#80, 16#80>>,
+        {<<16#FC>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#FC, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#FC, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#FC, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+        {<<16#FC, 16#80, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
 
         % No data in high bits.
-        <<16#C0, 16#80>>,
-        <<16#C1, 16#80>>,
+        {<<16#C0, 16#80>>, <<"\"\\u0000\"">>},
+        {<<16#C1, 16#80>>, <<"\"\\u0000\"">>},
 
-        <<16#E0, 16#80, 16#80>>,
-        <<16#E0, 16#90, 16#80>>,
+        {<<16#E0, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+        {<<16#E0, 16#90, 16#80>>, <<"\"\\u0000\"">>},
 
-        <<16#F0, 16#80, 16#80, 16#80>>,
-        <<16#F0, 16#88, 16#80, 16#80>>,
+        {<<16#F0, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+        {<<16#F0, 16#88, 16#80, 16#80>>, <<"\"\\u0000\"">>},
 
-        <<16#F8, 16#80, 16#80, 16#80, 16#80>>,
-        <<16#F8, 16#84, 16#80, 16#80, 16#80>>,
+        {<<16#F8, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+        {<<16#F8, 16#84, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
 
-        <<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>,
-        <<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>
+        {<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+        {<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}
     ].