Skip to content

Commit

Permalink
Adding custom hash functions, tests & MD5 default
Browse files Browse the repository at this point in the history
- MD5 is the default simhashing algorithm, for the accuracy/speed
  balance
- Added a way to customize the hashing algorithm at run time.
- Common Test tests!
  • Loading branch information
ferd committed Oct 25, 2012
1 parent 806686a commit b7237fa
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 18 deletions.
16 changes: 16 additions & 0 deletions Makefile
@@ -0,0 +1,16 @@
# None of the targets below produce a file of the same name.
.PHONY: all test clean
# NOTE(review): DEST is not used by any rule in this Makefile and PROJECT is
# not defined here -- presumably supplied by the environment; confirm.
PREFIX:=../
DEST:=$(PREFIX)$(PROJECT)

# Path to the rebar script used for compilation.
REBAR=./rebar

# Default target: compile the project with rebar.
all:
@$(REBAR) compile

# Compile, make sure the log directory exists, then run the Common Test
# spec (ct.spec) on a node named ct_node with ebin/ and include/ added to
# the code path and ERL_LIBS pointing at deps.
test:
@$(REBAR) compile
@mkdir -p logs/
@ct_run -name ct_node -spec ct.spec -pa `pwd`/ebin/ `pwd`/include/ -env ERL_LIBS `pwd`/deps

# Remove compiled output and test logs.
clean:
@rm -rf ebin/ logs/
40 changes: 31 additions & 9 deletions README.md
Expand Up @@ -25,22 +25,24 @@ For more resources on simhashing, you may read the following:

How To Build
------------
The module can be compiled using `./rebar compile`.
The module can be compiled using `./rebar compile` or `make`.

By default, the simhash library will use SHA-160 as the function to
hash the shingles made from the binary structure. It is the most
accurate one, but also the slowest one.
By default, the simhash library will use MD5 as the function to
hash the shingles made from the binary structure. It is the second most
accurate one, but also the second slowest one, striking a decent balance.

By passing macros, other hashing algorithms can be used:
- `PHASH` for Erlang's `phash2` (32 bits, fastest, least accurate)
- `MD5` for MD5 (128 bits, slower, more accurate)
- `SHA` for SHA-160 (default) (slowest, most accurate).
- `MD5` for MD5 (default) (128 bits, slower, more accurate)
- `SHA` for SHA-160 (slowest, most accurate).

If you want to use MD5 or phash2 hashing, it is recommended you
If you want to use SHA-160 or phash2 hashing by default, it is recommended you
provide the macros in your own `rebar` config or whatever other
tool that lets you declare them when compiling (`{d,'MD5'}` for
tool that lets you declare them when compiling (`{d,'SHA'}` for
example).

To run tests, call `make test`.

How To Use It
-------------

Expand Down Expand Up @@ -113,6 +115,20 @@ weighed features, so that some items are worth more than others:

In the tests above, you can see that by giving more weight to the color, it's possible to make the simhash behave differently for the same original string.

Finally, it is possible to use the simhash library with your own hash function if you wish to do so. The hash function must accept a binary and return a binary. You will also need to provide an argument specifying how many bits your hashes contain:

12> F = fun(X) -> crypto:hash_final(crypto:hash_update(crypto:hash_init(sha512), X)) end.
#Fun<erl_eval.6.82930912>
13> F(<<"abc">>).
<<221,175,53,161,147,...>>

The function `F` defines a simple way to call sha512 hashes from the crypto module. It can be used with simhashes as follows:

14> simhash:hash(<<"abcdef">>, F, 512).
<<60,149,116,223,113,...>>
15> simhash:hash([{5,<<"ab">>},{2, <<"cdef">>}], F, 512).
<<180,232,215,0,38,245,...>>

Notes
-----

Expand All @@ -122,4 +138,10 @@ to production anywhere else. Handle with caution.
Changelog
---------

0.2.0: Adding a way to submit a user's own features/shingles with weight.
### 0.3.0: ###
- MD5 is the default simhashing algorithm, for the accuracy/speed balance
- Added a way to customize the hashing algorithm at run time.
- Common Test tests!

### 0.2.0: ###
- Adding a way to submit a user's own features/shingles with weight.
5 changes: 5 additions & 0 deletions ct.spec
@@ -0,0 +1,5 @@
%% `root` is an alias for the directory holding the test suites.
{alias, root, "./test/"}.
%% Directory where the test run logs are written.
{logdir, "./logs/"}.

%% all suites (we only have one!)
{suites, root, all}.
2 changes: 1 addition & 1 deletion src/simhash.app.src
@@ -1,6 +1,6 @@
{application, simhash,
[{description, "Simhashing for Erlang"},
{vsn, "0.2.0"},
{vsn, "0.3.0"},
{registered, []},
{applications, [kernel, stdlib, crypto]}
]}.
34 changes: 26 additions & 8 deletions src/simhash.erl
@@ -1,5 +1,5 @@
-module(simhash).
-export([hash/1, closest/2, distance/2]).
-export([hash/1, hash/3, closest/2, distance/2]).
-compile([native]).

-ifdef(TEST).
Expand All @@ -15,9 +15,9 @@
%% simhash.
-define(DUP_WEIGHT_ADD,1).

%% Default random hash used by simhash is sha-160
%% Default random hash used by simhash is MD5
-ifndef(PHASH). -ifndef(MD5). -ifndef(SHA).
-define(SHA, true).
-define(MD5, true).
-endif. -endif. -endif.

%% erlang:phash2 is the fastest, but sadly
Expand Down Expand Up @@ -48,7 +48,10 @@
-type simhash() :: binary().
-type feature() :: {Weight::pos_integer(), binary()}.
-type features() :: [feature()].
-export_type([simhash/0, feature/0, features/0]).
-type hashfun() :: fun((binary()) -> binary()).
-type hashsize() :: pos_integer().
-export_type([simhash/0, feature/0, features/0,
hashfun/0, hashsize/0]).

%% Takes any binary and returns a simhash for that data.
-spec hash(binary()) -> simhash()
Expand All @@ -58,6 +61,15 @@ hash(Bin = <<_/binary>>) ->
hash(Features = [_|_]) ->
simhash_features(Features).

%% Computes a simhash with a caller-supplied hash function rather than
%% the one selected when the module was compiled.
%%   - first argument: either a binary (which is split into shingles) or a
%%     non-empty list of weighted features;
%%   - HashFun: fun((binary()) -> binary());
%%   - Size: the number of bits in the hashes HashFun produces.
-spec hash(binary(), hashfun(), hashsize()) -> simhash()
        ; (features(), hashfun(), hashsize()) -> simhash().
hash(Data, HashFun, Size) when is_binary(Data) ->
    hashed_shingles(Data, ?SHINGLE_SIZE, HashFun, Size);
hash([_|_] = FeatureList, HashFun, Size) ->
    simhash_features(FeatureList, HashFun, Size).

%% Takes a given simhash and returns the closest simhash
%% in a second list, based on their Hamming distance.
-spec closest(simhash(), [simhash(),...]) -> {non_neg_integer(), simhash()}.
Expand All @@ -78,11 +90,18 @@ simhash_features(Features) ->
Hashes = [{W, ?HASH(Feature)} || {W,Feature} <- Features],
to_sim(reduce(Hashes, ?SIZE-1)).

%% Hashes every weighted feature with the given fun, then reduces the
%% weighted hashes and flattens the result into a simhash. Size is the
%% bit width of the hashes; reduce/2 is handed the index of the last bit.
simhash_features(Features, Hash, Size) ->
    Weighted = [{Weight, Hash(Bin)} || {Weight, Bin} <- Features],
    LastBit = Size - 1,
    Vector = reduce(Weighted, LastBit),
    to_sim(Vector).

%% Splits the binary into shingles of the given size and returns the
%% simhash of those shingles, using the hash algorithm chosen when the
%% module was compiled.
hashed_shingles(Bin, Size) ->
    Shingles = shingles(Bin, Size),
    simhash_features(Shingles).

%% Same as hashed_shingles/2, but the shingles are hashed with the
%% caller-supplied HashFun, whose output is HashSize bits wide.
hashed_shingles(Bin, Size, HashFun, HashSize) ->
    Shingles = shingles(Bin, Size),
    simhash_features(Shingles, HashFun, HashSize).

%% The vector returned from reduce/2 is taken and flattened
%% by its content -- values greater or equal to 0 end up being 1,
%% and those smaller than 0 end up being 0.
Expand Down Expand Up @@ -160,10 +179,9 @@ hamming(X,Y,Pos,Sum) ->
end.

-ifdef(TEST).
%%%%%%%%%%%%%
%%% TESTS %%%
%%%%%%%%%%%%%
%% ad-hoc tests/benchmarks
%%% ad-hoc demos/benches, useful when fiddling with new features
%%% that can require manual validation, without actually impacting
%%% tests in test/simhash_SUITE.erl.

test() ->
L = [<<"the cat sat on the mat">>,<<"the cat sat on a mat">>,
Expand Down
73 changes: 73 additions & 0 deletions test/simhash_SUITE.erl
@@ -0,0 +1,73 @@
-module(simhash_SUITE).
-include_lib("common_test/include/ct.hrl").

-export([init_per_suite/1, end_per_suite/1, all/0]).

-export([closer_type/1, commutative_distance/1, id_has_0_distance/1,
custom_features/1, custom_hash/1]).
%% Common Test callback: the test cases of this suite.
all() ->
    [
     closer_type,
     commutative_distance,
     id_has_0_distance,
     custom_features,
     custom_hash
    ].

%% Suite-level setup. Several test cases compare simhash:hash/1 against
%% simhash:hash/3 called with erlang:md5/1 and 128 bits, so the suite is
%% skipped when the two disagree on a random 32-byte sample.
%% Returns Config unchanged on success, or {skip, Reason} otherwise.
init_per_suite(Config) ->
    %% crypto:rand_bytes/1 was deprecated and later removed from OTP;
    %% crypto:strong_rand_bytes/1 is the supported replacement.
    Bytes = crypto:strong_rand_bytes(32),
    case {simhash:hash(Bytes), simhash:hash(Bytes, fun erlang:md5/1, 128)} of
        {X,X} -> Config;
        {_,_} -> {skip, "Tests assume the default hash algorithm is MD5"}
    end.

%% Suite-level teardown: the config is ignored and `ok` is returned.
end_per_suite(_Config) ->
ok.

%% Two similar strings should be closer to each other than either of them
%% is to the binary form of an unrelated data type (here, a pid).
closer_type(_Config) ->
    Voice1 = simhash:hash(<<"My voice is my password.">>),
    Voice2 = simhash:hash(<<"My voice is my passport.">>),
    Pid = simhash:hash(term_to_binary(self())),
    %% Compare the string-to-string distance against each string-to-pid
    %% distance. Comparing the two string-to-pid distances with each other
    %% would only test an arbitrary relation that depends on the pid value.
    Near = simhash:distance(Voice1, Voice2),
    true = Near < simhash:distance(Voice1, Pid),
    true = Near < simhash:distance(Voice2, Pid).

%% The distance between two hashes must not depend on argument order.
commutative_distance(_Config) ->
    Password = simhash:hash(<<"My voice is my password.">>),
    Passport = simhash:hash(<<"My voice is my passport.">>),
    Forward = simhash:distance(Password, Passport),
    Backward = simhash:distance(Passport, Password),
    true = Forward == Backward.

%% A simhash is at distance 0 from itself.
id_has_0_distance(_Config) ->
    Password = simhash:hash(<<"My voice is my password.">>),
    Distance = simhash:distance(Password, Password),
    0 = Distance.

%% Features are lists of weighted binaries, hashed according to their
%% weight. When simhash:hash/1 doesn't receive a binary, it is given a
%% feature list instead.
%%
%% Expectations checked here:
%%   - raising the weight of the color feature from 1 to 5 changes the
%%     distance between the "black" and "blue" variants;
%%   - with a weight of 0 the color makes no difference at all.
custom_features(_Config) ->
    Common = [{1,<<"my">>}, {1,<<"car">>}, {1,<<"is">>}],
    HashWith = fun(Weight, Color) ->
        simhash:hash(Common ++ [{Weight, Color}])
    end,
    Black1 = HashWith(1, <<"black">>),
    Blue1 = HashWith(1, <<"blue">>),
    Blue5 = HashWith(5, <<"blue">>),
    Black5 = HashWith(5, <<"black">>),
    Blue0 = HashWith(0, <<"blue">>),
    Black0 = HashWith(0, <<"black">>),
    LightDistance = simhash:distance(Black1, Blue1),
    HeavyDistance = simhash:distance(Blue5, Black5),
    true = LightDistance =/= HeavyDistance,
    0 = simhash:distance(Blue0, Black0).

%% simhash:hash/3 with a user-supplied hash function:
%%   - SHA-512 and phash2 results differ from the default hash/1 result;
%%   - the same custom hash is deterministic;
%%   - erlang:md5/1 with 128 bits matches the default hash/1 result.
%% Each check runs on both a feature list and a plain binary.
custom_hash(_Config) ->
    FeatureList = [{1,<<"my">>}, {1,<<"car">>}, {1,<<"is">>}, {5,<<"black">>}],
    Binary = <<"some binary string">>,
    Inputs = [FeatureList, Binary],
    Sha512 = fun(Data) ->
        Ctx = crypto:hash_update(crypto:hash_init(sha512), Data),
        crypto:hash_final(Ctx)
    end,
    %% phash2 with a range of 2^32, packed into a 32-bit binary.
    Phash2 = fun(Data) ->
        N = erlang:phash2(Data, 4294967296),
        <<N:32>>
    end,
    Md5 = fun erlang:md5/1,
    true = lists:all(fun(In) ->
                         simhash:hash(In, Sha512, 512) =/= simhash:hash(In)
                     end, Inputs),
    true = lists:all(fun(In) ->
                         simhash:hash(In, Phash2, 32) =/= simhash:hash(In)
                     end, Inputs),
    true = simhash:hash(Binary, Phash2, 32) =:= simhash:hash(Binary, Phash2, 32),
    true = lists:all(fun(In) ->
                         simhash:hash(In, Md5, 128) =:= simhash:hash(In)
                     end, Inputs).

0 comments on commit b7237fa

Please sign in to comment.