forked from myui/hivemall
-
Notifications
You must be signed in to change notification settings - Fork 0
Pig: news20 k-NN search using b-bit minhash
daijyc edited this page Mar 2, 2015
·
2 revisions
-- k-nearest-neighbour search on the news20 dataset using b-bit minhash
-- signatures and Jaccard similarity (Hivemall UDFs invoked through HiveUDF).
--
-- Parameters (override on the command line with -param name=value):
--   topn      : number of most-similar training rows to print
--   numhashes : signature length handed to bbit_minhash and jaccard; the
--               same value must be used for every signature being compared
register hivemall-0.3-with-dependencies.jar
%default topn 10
%default numhashes 128

define bbit_minhash HiveUDF('hivemall.knn.lsh.bBitMinHashUDF');
define jaccard HiveUDF('hivemall.knn.distance.JaccardIndexUDF');

-- Training side: one minhash signature per training row.
-- The signature length is passed explicitly so that it cannot drift away
-- from the value used for the query signature and for jaccard below.
-- NOTE(review): the third argument is presumably the UDF's "noWeight" flag;
-- confirm against the Hivemall bBitMinHashUDF source.
a = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
b = foreach a generate rowid, bbit_minhash(features, ${numhashes}, false) as signature;

-- Query side: the single test row with rowid 1.
c = load 'news20_scale.test' as (rowid:int, label:int, features:{(featurepair:chararray)});
d = filter c by rowid == 1;
e = foreach d generate bbit_minhash(features, ${numhashes}, false) as signature;

-- Pair the query signature with every training signature and score each pair.
f = cross b, e;
g = foreach f generate b::rowid as rowid, jaccard(b::signature, e::signature, ${numhashes}) as similarity;

-- Keep the topn most similar training rows and print them.
h = order g by similarity desc;
i = limit h ${topn};
dump i;
rowid | similarity |
---|---|
11952 | 0.390625 |
10748 | 0.359375 |
12902 | 0.34375 |
12669 | 0.328125 |
3839 | 0.328125 |
11493 | 0.328125 |
3 | 0.328125 |
3087 | 0.328125 |
6333 | 0.3125 |
13604 | 0.3125 |