forked from myui/hivemall
Pig_news20 binary dataset
daijyc edited this page Feb 24, 2015
·
2 revisions
Download the news20b (news20.binary) dataset from http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#news20.binary and save it as news20.binary, the file name the commands below expect.
cat conv.awk
# conv.awk
# Rewrites each whitespace-separated input line "<label> <tok> <tok> ..." as
#   <line number> TAB <label> TAB {(<tok>),(<tok>),...}
# A line with nothing after the label produces an empty "{}".
BEGIN { FS = " " }

{
    bag = "{"
    delim = ""                          # no comma in front of the first tuple
    for (col = 2; col <= NF; col++) {
        bag = bag delim "(" $col ")"    # wrap every token in parentheses
        delim = ","
    }
    bag = bag "}"
    print NR "\t" $1 "\t" bag
}
# Shuffle the dataset so the train/test split is random.
sort -R news20.binary > news20.random
# The first 15000 shuffled lines form the training set.
head -n 15000 news20.random > news20.train
# Every remaining line (from line 15001 on) forms the test set. Taking "the
# rest" instead of a fixed count keeps the two sets disjoint and complete even
# if the input does not have exactly 15000 + 4996 lines.
tail -n +15001 news20.random > news20.test
# Convert both splits with conv.awk into the tab-separated form loaded by Pig.
gawk -f conv.awk news20.train > news20.train.t
gawk -f conv.awk news20.test > news20.test.t
# Copy the converted files into the current HDFS working directory.
hadoop fs -copyFromLocal news20.train.t .
hadoop fs -copyFromLocal news20.test.t .
-- Register the Hivemall jar so the classes named below can be resolved.
register hivemall-0.3-with-dependencies.jar
-- Expose the Hive UDF class hivemall.ftvec.AddBiasUDF under the name addBias.
define addBias HiveUDF('hivemall.ftvec.AddBiasUDF');
-- Expose the Hive UDTF class hivemall.ftvec.amplify.AmplifierUDTF under the name amplify.
-- NOTE(review): '(3,null,null)' is the constant-parameter string given to HiveUDTF;
-- presumably the 3 must match the first argument used where amplify is called -- confirm.
define amplify HiveUDTF('hivemall.ftvec.amplify.AmplifierUDTF', '(3,null,null)');
-- Build the training output: pass every row of news20.train.t through amplify
-- (first argument 3), put the resulting rows in random order, and store them.
-- Any previous output directory is removed first.
rmf news20_train_x3
train_raw = LOAD 'news20.train.t' AS (rowid:int, label:int, features:{(featurepair:chararray)});
train_amplified = FOREACH train_raw GENERATE FLATTEN(amplify(3, *)) AS (rowid:int,label:float,features:{(featurepair:chararray)});
-- Attach a random sort key, order by it, then drop the key again.
train_keyed = FOREACH train_amplified GENERATE RANDOM() AS rand, rowid, label, features;
train_shuffled = ORDER train_keyed BY rand;
train_out = FOREACH train_shuffled GENERATE rowid, label, features;
STORE train_out INTO 'news20_train_x3';
-- Build the test output: apply addBias to each row's feature bag, emit one
-- row per feature pair, and split every pair on ':' into separate fields.
-- Any previous output directory is removed first.
rmf news20_test_exploded
test_raw = LOAD 'news20.test.t' AS (rowid:int, label:int, features:{(featurepair:chararray)});
test_biased = FOREACH test_raw GENERATE rowid, label, addBias(features) AS features;
-- One output row per element of the features bag.
test_pairs = FOREACH test_biased GENERATE rowid, label, FLATTEN(features) AS featurepair:chararray;
-- STRSPLIT yields a tuple; FLATTEN turns its elements into top-level fields.
test_split = FOREACH test_pairs GENERATE rowid, label, FLATTEN(STRSPLIT(featurepair, ':'));
STORE test_split INTO 'news20_test_exploded';