Skip to content

Pig_Recommendation using Min wise LSH (minhash)

daijyc edited this page Mar 2, 2015 · 2 revisions

This example lists related (similar) articles for each article: two articles are treated as similar when MinHash assigns them to the same cluster.

Get recommendations

-- Put the Hivemall classes (and their bundled dependencies) on the script's classpath.
register hivemall-0.3-with-dependencies.jar

-- Expose Hive functions to Pig under short aliases.
-- NOTE(review): collect_set is defined here but not referenced by the statements in this script.
define collect_set HiveUDAF('collect_set');
define minhash HiveUDTF('hivemall.knn.lsh.MinHashUDTF');

-- Delete the two output locations written by the store statements in this script,
-- so that a re-run starts from a clean state.
rmf news20mc_cluster
rmf news20mc_similar_articles

-- Stage 1: assign every article to MinHash clusters and persist the cluster membership.

-- Each input row is (rowid, label, bag of feature strings).
articles = LOAD 'news20_scale.train' AS (rowid:int, label:int, features:{(featurepair:chararray)});

-- minhash is a table-generating function; FLATTEN turns its output into one
-- (clusterId, rowid) tuple per emitted row.
-- NOTE(review): presumably one article can be emitted under several clusterIds - confirm against the Hivemall docs.
b = FOREACH articles GENERATE FLATTEN(minhash(rowid, features)) AS (clusterId, rowid);

-- Collect the distinct article ids that landed in each cluster.
by_cluster = GROUP b BY clusterId;
cluster_members = FOREACH by_cluster {
  unique_rowids = DISTINCT b.rowid;
  GENERATE group, unique_rowids AS rowids;
}
STORE cluster_members INTO 'news20mc_cluster';

-- Stage 2: for every article, list the other articles that share at least one cluster with it.

-- Second alias over the same (clusterId, rowid) tuples so that b can be joined with itself.
cluster_peers = FOREACH b GENERATE *;

-- Pair up articles that fall into the same cluster.
-- NOTE(review): the LEFT OUTER looks redundant - a row with no right-hand match would have a
-- null cluster_peers::rowid and be discarded by the != filter below; confirm before changing.
same_cluster = JOIN b BY clusterId LEFT OUTER, cluster_peers BY clusterId;

-- Drop the trivial pairing of an article with itself.
other_pairs = FILTER same_cluster BY b::rowid != cluster_peers::rowid;
pairs = FOREACH other_pairs GENERATE b::rowid AS rowid, cluster_peers::rowid AS similarrowid;

-- One output row per article, carrying the de-duplicated bag of its similar articles.
pairs_by_article = GROUP pairs BY rowid;
similar_per_article = FOREACH pairs_by_article {
  unique_similar = DISTINCT pairs.similarrowid;
  GENERATE group AS rowid, unique_similar AS similarrowid;
}

-- Sort by article id before writing the result.
sorted_similar = ORDER similar_per_article BY rowid;
STORE sorted_similar INTO 'news20mc_similar_articles';
Clone this wiki locally