forked from myui/hivemall
-
Notifications
You must be signed in to change notification settings - Fork 0
Pig_webspam dataset
daijyc edited this page Feb 28, 2015
·
1 revision
Get the dataset from http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam
# Convert the downloaded LIBSVM-format file with conv.awk and write the result to webspam_raw.
# NOTE(review): conv.awk is not shown on this page; presumably it emits the
# rowid / label / features layout that the Pig script loads -- confirm.
awk -f conv.awk webspam_wc_normalized_trigram.svm > webspam_raw
# Upload the converted file to the current (".") directory of the Hadoop file system.
hadoop fs -put webspam_raw .
-- Splits 'webspam_raw' into a training set and a test set.
--   * training: trainsize randomly chosen rows, passed through rand_amplify,
--               stored in 'webspam_train_x3'
--   * test:     the remaining rows, stored as-is in 'webspam_test' and with one
--               feature per row in 'webspam_test_exploded'
--
-- Parameters (override with -param on the pig command line):
--   xtimes             first argument handed to rand_amplify
--   shufflebuffersize  second argument handed to rand_amplify
--   trainsize          number of rows used for training
%default xtimes 3
%default shufflebuffersize 100
%default trainsize 70000

register hivemall-0.3-with-dependencies.jar

define addBias HiveUDF('hivemall.ftvec.AddBiasUDF');
define rand_amplify HiveUDTF('hivemall.ftvec.amplify.RandomAmplifierUDTF', '(${xtimes}, ${shufflebuffersize}, null)');

-- Clear every output location up front so the script can be re-run;
-- 'store' fails when its target already exists.
rmf webspam_train_x3
rmf webspam_test
rmf webspam_test_exploded

a = load 'webspam_raw' as (rowid:int, label:int, features:{(featurepair:chararray)});

-- Total number of rows; used below as a scalar to size the test split.
b = group a all;
c = foreach b generate COUNT(a) as cnt;

-- Tag every row with a random sort key and apply addBias to its features.
d = foreach a generate RANDOM() as rand, rowid, label, addBias(features) as features:{(featurepair:chararray)};

-- Training split: the trainsize rows with the smallest random keys.
e = order d by rand;
f = limit e ${trainsize};
describe f;
train = foreach f generate flatten(rand_amplify(${xtimes}, ${shufflebuffersize}, rowid, label, features)) as (rowid:int,label:float,features:{(featurepair:chararray)});
store train into 'webspam_train_x3';

-- Test split: the (cnt - trainsize) rows with the largest random keys,
-- i.e. the rows not taken for training.
-- NOTE(review): this assumes both sorts see the same random keys in d -- confirm
-- that d is not re-evaluated between the two order-by statements.
test_sorted = order d by rand desc;
h = limit test_sorted c.cnt - ${trainsize};
i = foreach h generate rowid, label, features;
store i into 'webspam_test';

-- Exploded test set: one row per feature, with each 'featurepair' string split on ':'.
j = foreach i generate rowid, label, flatten(features) as featurepair:chararray;
k = foreach j generate rowid, label, flatten(STRSPLIT(featurepair, ':'));
store k into 'webspam_test_exploded';
'''
*Caution:* For this dataset, use a small *shufflebuffersize*: each training example has a large number of features, and (xtimes * shufflebuffersize * N) training examples are cached in memory.