forked from myui/hivemall
-
Notifications
You must be signed in to change notification settings - Fork 0
Pig: Recommendation using min-wise LSH (MinHash)
daijyc edited this page Mar 2, 2015
·
2 revisions
List related (similar) articles for each article.
-- Setup: make the Hivemall jar available and bind short names to the Hive
-- functions used from Pig.
register hivemall-0.3-with-dependencies.jar
-- Hive's collect_set UDAF wrapped for Pig.
-- NOTE(review): collect_set is not referenced anywhere in the visible script.
define collect_set HiveUDAF('collect_set');
-- Hivemall's MinHash UDTF, callable below as `minhash`.
define minhash HiveUDTF('hivemall.knn.lsh.MinHashUDTF');
-- Remove the two output locations left over from a previous run; both are
-- written again by the `store` statements later in the script.
rmf news20mc_cluster
rmf news20mc_similar_articles
-- Stage 1: assign articles to MinHash clusters and persist the mapping
-- clusterId -> distinct member rowids.

-- Each input row carries an article id, its label and a bag of feature strings.
articles = load 'news20_scale.train'
    as (rowid:int, label:int, features:{(featurepair:chararray)});

-- minhash is applied per article; flatten expands its output into
-- (clusterId, rowid) rows. Alias `b` is reused by the similarity stage.
b = foreach articles generate
    flatten(minhash(rowid, features)) as (clusterId, rowid);

-- Collect the distinct article ids that landed in each cluster.
by_cluster = group b by clusterId;
clusters = foreach by_cluster {
    members = distinct b.rowid;
    generate group, members as rowids;
}

store clusters into 'news20mc_cluster';
-- Stage 2: for every article, list the other articles that share at least one
-- MinHash cluster with it, and store the result ordered by article id.

-- Second alias over the same (clusterId, rowid) rows so the self-join below
-- can tell the two sides apart as b:: and e::.
e = foreach b generate *;

-- Pair up articles that fall into the same cluster. An inner join is enough:
-- the filter below compares against e::rowid, and a comparison with null is
-- never true, so any unmatched row a left outer join would add is discarded
-- anyway.
f = join b by clusterId, e by clusterId;

-- Drop the trivial pairing of an article with itself.
g = filter f by b::rowid != e::rowid;
h = foreach g generate b::rowid as rowid, e::rowid as similarrowid;

-- Two articles may collide in several clusters; keep each neighbour once.
i = group h by rowid;
j = foreach i {
    j0 = distinct h.similarrowid;
    generate group as rowid, j0 as similarrowid;
}

k = order j by rowid;
store k into 'news20mc_similar_articles';