Skip to content

Commit

Permalink
Add an option to ignore UTF-8 encoding errors
Browse files Browse the repository at this point in the history
By default Jiffy is quite strict in what it encodes. By default it will
not allow invalid UTF-8 to be produced. This can cause issues when
attempting to encode JSON that was decoded by other libraries as UTF-8
semantics are not uniformly enforced.

This patch adds an option 'force_utf8' to the encoder. If encoding hits
an error for an invalid string it will forcefully mutate the object to
contain only valid UTF-8 and return the resulting encoded JSON.

For the most part this means it will strip any garbage data from
binaries, replacing it with the replacement codepoint U+FFFD. It will
also try to detect the common error of encoding surrogate pairs as
three-byte sequences and re-encode them into UTF-8 properly.
  • Loading branch information
davisp committed Jun 1, 2012
1 parent 6f589d4 commit 414827d
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 31 deletions.
2 changes: 2 additions & 0 deletions c_src/encoder.c
Expand Up @@ -81,6 +81,8 @@ enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin)
e->uescape = 1;
} else if(enif_compare(val, e->atoms->atom_pretty) == 0) {
e->pretty = 1;
} else if(enif_compare(val, e->atoms->atom_force_utf8) == 0) {
// Ignore, handled in Erlang
} else {
return 0;
}
Expand Down
1 change: 1 addition & 0 deletions c_src/jiffy.c
Expand Up @@ -22,6 +22,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info)
st->atom_partial = make_atom(env, "partial");
st->atom_uescape = make_atom(env, "uescape");
st->atom_pretty = make_atom(env, "pretty");
st->atom_force_utf8 = make_atom(env, "force_utf8");

// Markers used in encoding
st->ref_object = make_atom(env, "$object_ref$");
Expand Down
1 change: 1 addition & 0 deletions c_src/jiffy.h
Expand Up @@ -18,6 +18,7 @@ typedef struct {
ERL_NIF_TERM atom_partial;
ERL_NIF_TERM atom_uescape;
ERL_NIF_TERM atom_pretty;
ERL_NIF_TERM atom_force_utf8;

ERL_NIF_TERM ref_object;
ERL_NIF_TERM ref_array;
Expand Down
4 changes: 4 additions & 0 deletions src/jiffy.erl
Expand Up @@ -25,7 +25,11 @@ encode(Data) ->


encode(Data, Options) ->
ForceUTF8 = lists:member(force_utf8, Options),
case nif_encode(Data, Options) of
{error, invalid_string} when ForceUTF8 == true ->
FixedData = jiffy_utf8:fix(Data),
encode(FixedData, Options -- [force_utf8]);
{error, _} = Error ->
throw(Error);
{partial, IOData} ->
Expand Down
104 changes: 104 additions & 0 deletions src/jiffy_utf8.erl
@@ -0,0 +1,104 @@
% This file is part of Jiffy released under the MIT license.
% See the LICENSE file for more information.

-module(jiffy_utf8).
-export([fix/1]).


% Walk a term and repair every binary found inside it.
%
% Binaries are repaired with fix_bin/1, `{Props}` objects with
% fix_props/2 and lists with fix_array/2. Any other term is
% returned untouched.
-spec fix(term()) -> term().
fix(Bin) when is_binary(Bin) ->
    fix_bin(Bin);
fix({Members}) ->
    fix_props(Members, []);
fix(Items) when is_list(Items) ->
    fix_array(Items, []);
fix(Other) ->
    Other.


% Repair both the key and the value of every `{Key, Value}` pair in
% Props and wrap the result in the `{Proplist}` object tuple.
%
% Acc holds already-fixed pairs in reverse order; they end up in
% front of the newly fixed ones. An element that is not a 2-tuple
% raises function_clause.
fix_props(Props, Acc) ->
    FixPair = fun({Key, Value}) -> {fix(Key), fix(Value)} end,
    {lists:reverse(Acc, lists:map(FixPair, Props))}.


% Repair every element of the list Values with fix/1, keeping order.
%
% Acc holds already-fixed elements in reverse order; they end up in
% front of the newly fixed ones.
fix_array(Values, Acc) ->
    lists:reverse(Acc, lists:map(fun fix/1, Values)).


% Rebuild Bin as valid UTF-8.
%
% The pipeline is: loosely decode the bytes into a list of integers,
% merge surrogate halves, swap anything that is not a Unicode
% character for U+FFFD, and finally re-encode the list as UTF-8.
fix_bin(Bin) ->
    Decoded = loose_decode(Bin, 0, []),
    CodePoints = replace_garbage(try_combining(Decoded, []), []),
    list_to_binary(xmerl_ucs:to_utf8(CodePoints)).


% Decode Bin, starting at byte offset Pos, into a list of integers
% without validating the decoded values.
%
% One- to four-byte sequences that have a well-formed lead byte and
% the right number of continuation bytes are decoded by simply
% concatenating their payload bits (so overlong forms and surrogates
% come through as plain integers). Anything else is replaced by a
% single U+FFFD. Acc collects the result in reverse order.
loose_decode(Bin, Pos, Acc) ->
    case Bin of
        <<_:Pos/binary>> ->
            % Offset reached the end of the input.
            lists:reverse(Acc);
        <<_:Pos/binary, 0:1, Cp:7, _/binary>> ->
            % 0xxxxxxx
            loose_decode(Bin, Pos + 1, [Cp | Acc]);
        <<_:Pos/binary, 2#110:3, A:5, 2#10:2, B:6, _/binary>> ->
            % 110xxxxx 10xxxxxx
            Cp = (A bsl 6) bor B,
            loose_decode(Bin, Pos + 2, [Cp | Acc]);
        <<_:Pos/binary, 2#1110:4, A:4, 2#10:2, B:6, 2#10:2, C:6,
          _/binary>> ->
            % 1110xxxx 10xxxxxx 10xxxxxx
            Cp = (A bsl 12) bor (B bsl 6) bor C,
            loose_decode(Bin, Pos + 3, [Cp | Acc]);
        <<_:Pos/binary, 2#11110:5, A:3, 2#10:2, B:6, 2#10:2, C:6,
          2#10:2, D:6, _/binary>> ->
            % 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            Cp = (A bsl 18) bor (B bsl 12) bor (C bsl 6) bor D,
            loose_decode(Bin, Pos + 4, [Cp | Acc]);
        <<_:Pos/binary, _:8, Tail/binary>> ->
            % Broken lead or continuation byte. Drop that byte plus
            % every continuation byte directly after it and emit one
            % replacement code point for the whole run.
            Skip = 1 + count_continuation_bytes(Tail, 0),
            loose_decode(Bin, Pos + Skip, [16#FFFD | Acc])
    end.


% Scan Bytes from byte offset Count while the byte at that offset is
% a UTF-8 continuation byte (top two bits are 10), and return the
% offset of the first byte that is not one (or the end of Bytes).
% Called with Count = 0 this is the length of the leading run of
% continuation bytes.
count_continuation_bytes(Bytes, Count) ->
    case Bytes of
        <<_:Count/binary, Byte, _/binary>> when Byte band 16#C0 =:= 16#80 ->
            count_continuation_bytes(Bytes, Count + 1);
        _ ->
            Count
    end.


% Merge UTF-16 surrogate pairs that show up as two separate
% integers into the single code point they stand for.
%
% A pair is only merged when a high surrogate (D800..DBFF) is
% immediately followed by a low surrogate (DC00..DFFF). Any other
% value, including an unpaired or misordered surrogate, is copied
% through one element at a time, so a stray surrogate never consumes
% the first half of a valid pair that follows it.
%
% The code point is computed directly instead of going through
% xmerl_ucs:from_utf16be/1, which writes to standard output on
% invalid input. Without a try/catch every clause is tail recursive.
%
% Acc collects the result in reverse order.
try_combining([], Acc) ->
    lists:reverse(Acc);
try_combining([H, L | Rest], Acc) when H >= 16#D800, H =< 16#DBFF,
                                       L >= 16#DC00, L =< 16#DFFF ->
    % Standard UTF-16 decoding: ten payload bits from each half,
    % offset by 0x10000.
    C = 16#10000 + ((H band 16#3FF) bsl 10) + (L band 16#3FF),
    try_combining(Rest, [C | Acc]);
try_combining([C | Rest], Acc) ->
    try_combining(Rest, [C | Acc]).


% Substitute U+FFFD for every element of Chars that
% xmerl_ucs:is_unicode/1 rejects; accepted elements are kept as-is.
%
% Acc holds already-processed elements in reverse order; they end up
% in front of the newly processed ones.
replace_garbage(Chars, Acc) ->
    Scrub = fun(C) ->
        case xmerl_ucs:is_unicode(C) of
            true -> C;
            false -> 16#FFFD
        end
    end,
    lists:reverse(Acc, [Scrub(C) || C <- Chars]).
67 changes: 36 additions & 31 deletions test/004-strings.t
Expand Up @@ -6,7 +6,7 @@ main([]) ->
code:add_pathz("ebin"),
code:add_pathz("test"),

etap:plan(87),
etap:plan(116),
util:test_good(good()),
util:test_good(uescaped(), [uescape]),
util:test_errors(errors()),
Expand Down Expand Up @@ -61,12 +61,17 @@ errors() ->

test_utf8([]) ->
ok;
test_utf8([Case | Rest]) ->
test_utf8([{Case, Fixed} | Rest]) ->
etap:fun_is(
fun({error, invalid_string}) -> true; (Else) -> Else end,
(catch jiffy:encode(Case)),
lists:flatten(io_lib:format("Invalid utf-8: ~p", [Case]))
),
etap:fun_is(
fun(Fixed) -> true; (Else) -> Else end,
jiffy:encode(Case, [force_utf8]),
lists:flatten(io_lib:format("Fixed correctly: ~p", [Fixed]))
),
Case2 = <<34, Case/binary, 34>>,
etap:fun_is(
fun({error, {_, invalid_string}}) -> true; (Else) -> Else end,
Expand All @@ -78,47 +83,47 @@ test_utf8([Case | Rest]) ->
utf8_cases() ->
[
% Stray continuation byte
<<16#C2, 16#81, 16#80>>,
<<"foo", 16#80, "bar">>,
{<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>},
{<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>},

% Invalid Unicode code points
<<239, 191, 190>>,
<<237, 160, 129>>,
{<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>},
{<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>},

% Not enough extension bytes
<<16#C0>>,
{<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},

<<16#E0>>,
<<16#E0, 16#80>>,
{<<16#E0>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#E0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},

<<16#F0>>,
<<16#F0, 16#80>>,
<<16#F0, 16#80, 16#80>>,
{<<16#F0>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F0, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},

<<16#F8>>,
<<16#F8, 16#80>>,
<<16#F8, 16#80, 16#80>>,
<<16#F8, 16#80, 16#80, 16#80>>,
{<<16#F8>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F8, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F8, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#F8, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},

<<16#FC>>,
<<16#FC, 16#80>>,
<<16#FC, 16#80, 16#80>>,
<<16#FC, 16#80, 16#80, 16#80>>,
<<16#FC, 16#80, 16#80, 16#80, 16#80>>,
{<<16#FC>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
{<<16#FC, 16#80, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},

% No data in high bits.
<<16#C0, 16#80>>,
<<16#C1, 16#80>>,
{<<16#C0, 16#80>>, <<"\"\\u0000\"">>},
{<<16#C1, 16#80>>, <<"\"\\u0000\"">>},

<<16#E0, 16#80, 16#80>>,
<<16#E0, 16#90, 16#80>>,
{<<16#E0, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#E0, 16#90, 16#80>>, <<"\"\\u0000\"">>},

<<16#F0, 16#80, 16#80, 16#80>>,
<<16#F0, 16#88, 16#80, 16#80>>,
{<<16#F0, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#F0, 16#88, 16#80, 16#80>>, <<"\"\\u0000\"">>},

<<16#F8, 16#80, 16#80, 16#80, 16#80>>,
<<16#F8, 16#84, 16#80, 16#80, 16#80>>,
{<<16#F8, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#F8, 16#84, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},

<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>,
<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>
{<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
{<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}
].

0 comments on commit 414827d

Please sign in to comment.