Permalink
Browse files

Improving shingling/weights for deduping

Now able to process a lot more messages per second while keeping
the simhash results just as significant.

This allows us to move away from phash2 as a hash algorithm (32
bits) to MD5 (128 bits). While it copies a bit more data for each
communication, it allows much more accuracy in duplicate
detection.
  • Loading branch information...
1 parent 1240b68 commit a950d24ed2c02995ba94a5c9e13fb0e3d0228d19 @ferd committed Oct 16, 2012
Showing with 31 additions and 6 deletions.
  1. +3 −2 rebar.config
  2. +25 −1 src/lager_deduper.erl
  3. +3 −3 test/lager_overload.erl
View
@@ -1,10 +1,11 @@
-{erl_opts, [debug_info]}.
+{erl_opts, [debug_info,
+ {d,'MD5'}]}.
{erl_first_files, ["src/lager_util.erl"]}.
{cover_enabled, true}.
{edoc_opts, [{stylesheet_file, "./priv/edoc.css"}]}.
{deps, [
- {simhash, "0\\.1\\.0",
+ {simhash, "0\\.2\\.0",
{git, "https://github.com/ferd/simhash.git", "master"}}
]}.
View
@@ -17,7 +17,7 @@ start_link() ->
gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
dedup_notify(Dest, Level, Timestamp, Msg) ->
- Hash = simhash:hash(iolist_to_binary(Msg)),
+ Hash = hash(Msg),
Key = {Level, Hash},
case gen_server:call(?SERVER, {seen, Key}) of
yes ->
@@ -28,6 +28,30 @@ dedup_notify(Dest, Level, Timestamp, Msg) ->
gen_server:cast(?SERVER, {set, Key, {log, Dest, lager_util:level_to_num(Level), Timestamp, Msg}})
end.
+hash([_LvlStr, Loc, Msg]) -> % Takes a 3-element list; the first element (_LvlStr) is ignored.
+ %% The location can be important, but not always -- depends on where error logging takes place.
+ %% It is weighted at a quarter of the shingle count; every shingle carries weight 1, so the
+ %% location is about 20% of the summed weights, which seemed to strike a fair balance.
+ Res = shingle(Msg), % list of {1, Token} pairs
+ case re:split(Loc, "@") of
+ [_|[MFA]] -> % exactly two parts, i.e. a single "@" in Loc; the part after it is used
+ Weight = round(length(Res) * 0.25), % rounds to 0 when Res has fewer than two shingles
+ simhash:hash([{Weight, MFA} | Res]);
+ _ -> % no "@" (or more than one): hash the message shingles alone
+ simhash:hash(Res)
+ end.
+
+shingle(IoList) -> % Splits IoList on the pattern below and tags each non-empty piece with weight 1.
+ %% Precompiled regex; per the original note, equivalent to compiling "\s|,|\\." (in an Erlang string "\s" is a literal space), so presumably it splits on space, comma or period (bytes 32, 44, 46 below). NOTE(review): compiled re patterns may not be portable across OTP releases -- confirm.
+ Pattern = {re_pattern,0,0,
+ <<69,82,67,80,67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,93,0,5,27,32,
+ 83,0,5,27,44,83,0,5,27,46,84,0,15,0>>},
+ [{1, X} || X <- re:split(IoList, Pattern), X =/= <<>>]. % empty fragments are dropped
+
+%hash(Msg) ->
+% simhash:hash(iolist_to_binary(Msg)).
+
%% Callback invoked with an empty argument list. Starts a timer that sends a
%% `dump' timeout message to this process after delay() milliseconds, and
%% returns the initial #state{} holding that timer reference and the value
%% of empty() as the db.
init([]) ->
Ref = erlang:start_timer(delay(), self(), dump),
{ok, #state{timer=Ref, db=empty()}}. % TODO: check for a decent DB format
View
@@ -8,7 +8,7 @@
init_regular() ->
error_logger:tty(false),
application:load(lager),
- application:set_env(lager, handlers, [{lager_console_backend, info}]),
+ application:set_env(lager, handlers, [{lager_console_backend, [info,true]}]),
application:set_env(lager, error_logger_redirect, false),
application:start(crypto),
ok=application:start(simhash),
@@ -22,14 +22,14 @@ init_regular() ->
init_dedup() ->
error_logger:tty(false),
application:load(lager),
- application:set_env(lager, handlers, [{lager_console_backend, info}]),
+ application:set_env(lager, handlers, [{lager_console_backend, [info,true]}]),
application:set_env(lager, error_logger_redirect, false),
application:start(crypto),
application:load(simhash),
application:start(simhash),
application:start(compiler),
application:start(syntax_tools),
- application:set_env(lager, duplicate_treshold, 4),
+ application:set_env(lager, duplicate_treshold, 3),
application:set_env(lager, duplicate_dump, 1000),
ok=application:start(lager).

0 comments on commit a950d24

Please sign in to comment.