Skip to content

Commit

Permalink
Provide a more accurate size check for max_document_size limit
Browse files Browse the repository at this point in the history
max_document_size currently checks document sizes based on Erlang's external
term size of the jiffy-decoded document body. This makes sense because that's
what is used to store the data on disk and it's what is manipulated by the
CouchDB internals.

However, Erlang term size is not always a good approximation of the size of
JSON-encoded data. Sometimes it can be way off (I've seen 30% off) and it's hard
for users to estimate or check the external term size beforehand. So for example
if max_document_size is 1MB, CouchDB might reject a user's 600KB JSON document
because Erlang's external term size of that document is greater than 1MB.

To fix the issue provide a module which calculates the encoded size of a json
document. The size calculation approximates as well, since there is no
canonical json size as it depends on the encoder used.

Issue apache#659
  • Loading branch information
nickva committed Sep 12, 2017
1 parent ef8a934 commit de4f311
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/chttpd/test/chttpd_db_doc_size_tests.erl
Expand Up @@ -93,7 +93,7 @@ put_single_doc(Url) ->

bulk_doc(Url) ->
NewDoc = "{\"docs\": [{\"doc1\": 1}, {\"errordoc\":
\"this_should_be_the_error_document\"}]}",
\"this_should_be_the_too_large_error_document\"}]}",
{ok, _, _, ResultBody} = test_request:post(Url ++ "/_bulk_docs/",
[?CONTENT_JSON, ?AUTH], NewDoc),
ResultJson = ?JSON_DECODE(ResultBody),
Expand Down
2 changes: 1 addition & 1 deletion src/couch/src/couch_doc.erl
Expand Up @@ -128,7 +128,7 @@ doc_to_json_obj(#doc{id=Id,deleted=Del,body=Body,revs={Start, RevIds},
from_json_obj_validate(EJson) ->
MaxSize = config:get_integer("couchdb", "max_document_size", 4294967296),
Doc = from_json_obj(EJson),
case erlang:external_size(Doc#doc.body) =< MaxSize of
case couch_ejson_size:encoded_size(Doc#doc.body) =< MaxSize of
true ->
validate_attachment_sizes(Doc#doc.atts),
Doc;
Expand Down
99 changes: 99 additions & 0 deletions src/couch/src/couch_ejson_size.erl
@@ -0,0 +1,99 @@
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.

-module(couch_ejson_size).

-export([encoded_size/1]).


%% Compound objects

encoded_size({[]}) ->
2; % opening { and closing }

encoded_size({KVs}) ->
% Would add 2 because opening { and closing }, but then inside the LC
% would accumulate an extra , at the end so subtract 2 - 1
1 + lists:sum([encoded_size(K) + encoded_size(V) + 2 || {K,V} <- KVs]);

encoded_size([]) ->
2; % opening [ and closing ]

encoded_size(List) when is_list(List) ->
% 2 is for [ and ] but inside LC would accumulate an extra , so subtract
% 2 - 1
1 + lists:sum([encoded_size(V) + 1 || V <- List]);

%% Floats.

encoded_size(0.0) ->
3;

encoded_size(1.0) ->
3;

encoded_size(Float) when is_float(Float), Float < 0.0 ->
encoded_size(-Float) + 1;

encoded_size(Float) when is_float(Float), Float < 1.0 ->
if
Float =< 1.0e-300 -> 3; % close enough to 0.0
Float =< 1.0e-100 -> 6; % Xe-YYY
Float =< 1.0e-10 -> 5; % Xe-YY
Float =< 0.01 -> 4; % Xe-Y, 0.0X
true -> 3 % 0.X
end;

encoded_size(Float) when is_float(Float) ->
if
Float >= 1.0e100 -> 5; % XeYYY
Float >= 1.0e10 -> 4; % XeYY
true -> 3 % XeY, X.Y
end;

%% Integers

encoded_size(0) ->
1;

encoded_size(Integer) when is_integer(Integer), Integer < 0 ->
encoded_size(-Integer) + 1;

encoded_size(Integer) when is_integer(Integer) ->
if
Integer < 10 -> 1;
Integer < 100 -> 2;
Integer < 1000 -> 3;
Integer < 10000 -> 4;
true -> trunc(math:log10(Integer)) + 1
end;

%% Strings

encoded_size(Binary) when is_binary(Binary) ->
2 + byte_size(Binary);

%% Special terminal symbols as atoms

encoded_size(null) ->
4;

encoded_size(true) ->
4;

encoded_size(false) ->
5;

%% Other atoms

encoded_size(Atom) when is_atom(Atom) ->
encoded_size(atom_to_binary(Atom, utf8)).
72 changes: 72 additions & 0 deletions src/couch/test/couch_ejson_size_tests.erl
@@ -0,0 +1,72 @@
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.

-module(couch_ejson_size_tests).

-include_lib("eunit/include/eunit.hrl").

-define(HWAIR, $\x{10348}). % 4 byte utf8 encoding
-define(EURO, $\x{20ac}). % 3 byte utf8 encoding
-define(CENT, $\x{a2}). % 2 byte utf8 encoding


ejson_size_test_() ->
[?_assertEqual(R, couch_ejson_size:encoded_size(Input)) || {R, Input} <- [
{1, 1}, {1, 1}, {2, -1}, {1, 9}, {2, 10}, {3, -10},
{2, 11}, {2, 99}, {3, 100}, {3, 999}, {4, 1000}, {4, 9999},
{5, 10000},

{3, 0.0}, {3, 0.1}, {3, 1.0}, {4, -1.0}, {3, 1.0e9},
{4, 1.0e10}, {5, 1.0e-10}, {5, 1.0e-99}, {6, 1.0e-100}, {3, 1.0e-323},

{2, arr_nested(0)}, {22, arr_nested(10)}, {2002, arr_nested(1000)},
{9, obj_nested(0)}, {69, obj_nested(10)}, {6009, obj_nested(1000)},

{4, null}, {4, true}, {5, false},

{3, str(1, $x)}, {4, str(1, ?CENT)}, {5, str(1, ?EURO)},
{6, str(1, ?HWAIR)}, {3, str(1, $\x{1})}, {12, str(10, $x)},
{22, str(10, ?CENT)}, {32, str(10, ?EURO)}, {42, str(10, ?HWAIR)},
{12, str(10, $\x{1})}
]].


%% Helper functions

arr_nested(MaxDepth) ->
arr_nested(MaxDepth, 0).


obj_nested(MaxDepth) ->
obj_nested(MaxDepth, 0).


obj(N, K, V) ->
{[{K, V} || _ <- lists:seq(1, N)]}.


str(N, C) ->
unicode:characters_to_binary([C || _ <- lists:seq(1, N)]).


arr_nested(MaxDepth, MaxDepth) ->
[];

arr_nested(MaxDepth, Depth) ->
[arr_nested(MaxDepth, Depth + 1)].


obj_nested(MaxDepth, MaxDepth) ->
obj(1, <<"k">>, <<"v">>);

obj_nested(MaxDepth, Depth) ->
{[{<<"k">>, obj_nested(MaxDepth, Depth + 1)}]}.

0 comments on commit de4f311

Please sign in to comment.