Permalink
Browse files

CBD-453 Exclude non-utf8 doc ids from views

Change-Id: I9fbe3e6a875f75e838b4fae6cbce098aec2217a2
Reviewed-on: http://review.couchbase.org/20210
Tested-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Volker Mische <volker.mische@gmail.com>
  • Loading branch information...
1 parent b945921 commit 8f678a3c88d4941f771f45ace416bff735de1686 Damien Katz committed with Peter Wansch Aug 27, 2012
Showing with 31 additions and 34 deletions.
  1. +22 −4 src/couch_set_view/src/couch_set_view_updater.erl
  2. +9 −30 src/couchdb/couch_util.erl
@@ -338,7 +338,7 @@ load_changes(Owner, Updater, Group, MapQueue, Writer, ActiveParts, PassiveParts)
try
Since = couch_util:get_value(PartId, SinceSeqs),
ChangesWrapper = fun(DocInfo, _, AccCount2) ->
- load_doc(Db, PartId, DocInfo, MapQueue),
+ load_doc(Db, PartId, DocInfo, MapQueue, Group),
{ok, AccCount2 + 1}
end,
{ok, _, AccCount3} = couch_db:fast_reads(Db, fun() ->
@@ -389,7 +389,7 @@ notify_owner(Owner, Msg, UpdaterPid) ->
Owner ! {updater_info, UpdaterPid, Msg}.
-load_doc(Db, PartitionId, DocInfo, MapQueue) ->
+load_doc(Db, PartitionId, DocInfo, MapQueue, Group) ->
#doc_info{id=DocId, local_seq=Seq, deleted=Deleted} = DocInfo,
case DocId of
<<?DESIGN_DOC_PREFIX, _/binary>> ->
@@ -398,8 +398,26 @@ load_doc(Db, PartitionId, DocInfo, MapQueue) ->
if Deleted ->
couch_work_queue:queue(MapQueue, {Seq, #doc{id=DocId, deleted=true}, PartitionId});
true ->
- {ok, Doc} = couch_db:open_doc_int(Db, DocInfo, []),
- couch_work_queue:queue(MapQueue, {Seq, Doc, PartitionId})
+ case couch_util:validate_utf8(DocId) of
+ true ->
+ {ok, Doc} = couch_db:open_doc_int(Db, DocInfo, []),
+ couch_work_queue:queue(MapQueue, {Seq, Doc, PartitionId});
+ false ->
+ #set_view_group{
+ set_name = SetName,
+ name = DDocId,
+ type = GroupType
+ } = Group,
+ % If the id isn't utf8 (memcached allows it), then log an error
+ % message and skip the doc. Send it through the queue anyway
+ % so we record the high seq num in case there are a bunch of
+ % these at the end, we want to keep track of the high seq and
+ % not reprocess again.
+ ?LOG_MAPREDUCE_ERROR("Bucket `~s`, ~s group `~s`, skipping "
+ "document with non-utf8 id. Doc id bytes: ~w",
+ [SetName, GroupType, DDocId, ?b2l(DocId)]),
+ couch_work_queue:queue(MapQueue, {Seq, #doc{id=DocId, deleted=true}, PartitionId})
+ end
end
end.
View
@@ -102,36 +102,15 @@ simple_call(Pid, Message) ->
erlang:demonitor(MRef, [flush])
end.
-validate_utf8(Data) when is_list(Data) ->
- validate_utf8(?l2b(Data));
-validate_utf8(Bin) when is_binary(Bin) ->
- validate_utf8_fast(Bin, 0).
-
-validate_utf8_fast(B, O) ->
- case B of
- <<_:O/binary>> ->
- true;
- <<_:O/binary, C1, _/binary>> when
- C1 < 128 ->
- validate_utf8_fast(B, 1 + O);
- <<_:O/binary, C1, C2, _/binary>> when
- C1 >= 194, C1 =< 223,
- C2 >= 128, C2 =< 191 ->
- validate_utf8_fast(B, 2 + O);
- <<_:O/binary, C1, C2, C3, _/binary>> when
- C1 >= 224, C1 =< 239,
- C2 >= 128, C2 =< 191,
- C3 >= 128, C3 =< 191 ->
- validate_utf8_fast(B, 3 + O);
- <<_:O/binary, C1, C2, C3, C4, _/binary>> when
- C1 >= 240, C1 =< 244,
- C2 >= 128, C2 =< 191,
- C3 >= 128, C3 =< 191,
- C4 >= 128, C4 =< 191 ->
- validate_utf8_fast(B, 4 + O);
- _ ->
- false
- end.
+validate_utf8(Val) when is_binary(Val) ->  % binary input: true only if a utf8 -> utf8 conversion reproduces Val exactly
+ case unicode:characters_to_binary(Val, utf8, utf8) of
+ Bin when Val =:= Bin ->  % conversion result is identical to the input
+ true;
+ _ ->  % any other result (per the stdlib docs, an error/incomplete tuple for malformed input) is rejected
+ false
+ end;
+validate_utf8(Val) when is_list(Val) ->  % list input: convert via ?l2b, then apply the binary check above
+ validate_utf8(?l2b(Val)).  % NOTE(review): ?l2b is defined outside this view - presumably list_to_binary; confirm
to_hex([]) ->
[];

0 comments on commit 8f678a3

Please sign in to comment.