forked from myui/hivemall
-
Notifications
You must be signed in to change notification settings - Fork 0
Pig_MovieLens Matrix Factorization
daijyc edited this page Mar 2, 2015
·
5 revisions
This page explains how to run matrix factorization on the MovieLens 1M dataset.
Caution: Matrix factorization is supported in Hivemall v0.3 or later.
-- Compute the global mean rating over the whole training set; the result is
-- the value to pass as the "mu" parameter of the scripts that follow.
ratings = load 'training' as (userid:int, movieid:int, rating:int);
everything = group ratings all;
mean_rating = foreach everything generate AVG(ratings.rating);
dump mean_rating;
3.58144125
-- Train a matrix factorization model with SGD and store it as 'sgd_model'.
register hivemall-0.3-with-dependencies.jar
-- Tunable parameters: global mean rating, number of latent factors, iterations.
%default mu 3.58176375
%default factor 10
%default iters 50
define array_avg HiveUDAF('hivemall.tools.array.ArrayAvgUDAF');
define train_mf_sgd HiveUDTF('hivemall.mf.MatrixFactorizationSGDUDTF', '(null, null, null, "-factor ${factor} -mu ${mu} -iter ${iters}")');
-- Remove the output of any previous run before storing a new model.
rmf sgd_model
a = load 'training' as (userid:int, movieid:int, rating:int);
-- Each UDTF output row carries an index, its user/item factor vectors and biases.
b = foreach a generate flatten(train_mf_sgd(userid, movieid, rating, '-factor ${factor} -mu ${mu} -iter ${iters}')) as (idx:int, u_rank:{(rank:float)}, m_rank:{(rank:float)}, u_bias:float, m_bias:float);
-- Average the rows emitted for the same index so the model has one row per idx.
c = group b by idx;
d = foreach c generate group as idx, array_avg(b.u_rank) as Pu, array_avg(b.m_rank) as Qi, AVG(b.u_bias) as Bu, AVG(b.m_bias) as Bi;
-- Store the averaged model (d), not the raw UDTF output (b): the scripts that
-- load 'sgd_model' expect the (idx, Pu, Qi, Bu, Bi) schema produced here.
store d into 'sgd_model';
See OnlineMatrixFactorizationUDTF#getOption() for other options.
Note that there is no need to set an exact value for $mu. Training actually works without setting $mu, but setting one is recommended for better predictions.
Note: Hivemall also provides train_mf_adagrad for training using AdaGrad.
-- Evaluate the stored model on the test set and print MAE and RMSE.
register hivemall-0.3-with-dependencies.jar
%default mu 3.58176375
define mf_predict HiveUDF('hivemall.mf.MFPredictionUDF');
define mae HiveUDAF('hivemall.evaluation.MeanAbsoluteErrorUDAF');
define rmse HiveUDAF('hivemall.evaluation.RootMeanSquaredErrorUDAF');
rmf predicted
testing = load 'testing' as (userid:int, movieid:int, rating:int);
sgd_model = load 'sgd_model' as (idx:int, Pu:{(float)}, Qi:{(float)}, Bu:double, Bi:double);
-- First join: attach the user-side factors and bias to every test rating.
user_joined = join testing by userid LEFT OUTER, sgd_model by idx;
with_user = foreach user_joined generate
    testing::userid as userid,
    testing::movieid as movieid,
    testing::rating as actual,
    sgd_model::Pu as Pu,
    sgd_model::Bu as Bu;
-- Second join: attach the item-side factors and bias by movie id.
item_joined = join with_user by movieid, sgd_model by idx;
scored = foreach item_joined generate
    with_user::actual as actual,
    mf_predict(with_user::Pu, sgd_model::Qi, with_user::Bu, sgd_model::Bi, ${mu}) as predicted;
-- Aggregate every (predicted, actual) pair into the two error metrics.
all_scored = group scored all;
metrics = foreach all_scored generate
    mae(scored.(predicted, actual)) as mae,
    rmse(scored.(predicted, actual)) as rmse;
dump metrics;
0.6728969407733578 (MAE)
0.8584162122694449 (RMSE)
Recommend the top-k movies that a user has never seen.
-- Recommend the top-k movies for one user, excluding movies the user already rated.
register hivemall-0.3-with-dependencies.jar
%default userid 1
%default topk 5
%default mu 3.58176375
define mf_predict HiveUDF('hivemall.mf.MFPredictionUDF');
sgd_model = load 'sgd_model' as (idx:int, Pu:{(float)}, Qi:{(float)}, Bu:double, Bi:double);
training = load 'training' as (userid:int, movieid:int, rating:int);
-- Movies the target user has already rated.
b = filter training by userid == ${userid};
c = foreach b generate movieid;
-- Pair each model row with the user's rated movies that share its index.
d = cogroup sgd_model by idx, c by movieid;
-- Keep only the groups with no rated movie, i.e. candidates the user has not seen.
e = filter d by IsEmpty(c.movieid);
-- Iterate over the filtered relation (e); iterating over d would ignore the
-- filter above and let already-rated movies back into the candidates.
f = foreach e generate flatten(sgd_model) as (movieid:int, Pu:{(float)}, Qi:{(float)}, Bu:double, Bi:double);
-- The target user's own factor vector and bias.
personal_model = filter sgd_model by idx == ${userid};
personal_model = foreach personal_model generate Pu, Bu;
-- Score every candidate movie against the user's factors.
g = cross f, personal_model;
h = foreach g generate f::movieid as movieid, mf_predict(personal_model::Pu, f::Qi, personal_model::Bu, f::Bi, ${mu}) as predicted;
-- Highest predicted ratings first; keep the top k.
i = order h by predicted DESC;
j = limit i ${topk};
dump j;
movieid | predicted |
---|---|
920 | 4.8556857 |
1035 | 4.792883 |
1721 | 4.784465 |
527 | 4.75746 |
2503 | 4.717167 |