forked from myui/hivemall
-
Notifications
You must be signed in to change notification settings - Fork 0
Pig_news20 multiclass dataset
daijyc edited this page Mar 2, 2015
·
2 revisions
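Download the news20 multiclass dataset (news20.scale and news20.t.scale) from http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20, then convert it into a Pig-loadable, tab-separated format with the following awk script (saved as conv.awk):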
# conv.awk
# Rewrites every input line as three tab-separated columns:
#   <input record number> TAB <first field> TAB {(<field 2>),(<field 3>),...}
# i.e. the first field is kept as the label and each remaining field is
# wrapped in parentheses and collected into a brace-delimited,
# comma-separated list.
BEGIN { FS = " " }
{
    bag = ""
    for (col = 2; col <= NF; col++) {
        # No separator in front of the first feature field.
        sep = (col == 2) ? "" : ","
        bag = bag sep "(" $col ")"
    }
    print NR "\t" $1 "\t" "{" bag "}";
}
END {}
# Run the conversion script (conv.awk) over the training and test files,
# writing the converted rows to news20_scale.train / news20_scale.test.
$ gawk -f conv.awk news20.scale > news20_scale.train
$ gawk -f conv.awk news20.t.scale > news20_scale.test
# Copy both converted files from the local disk to "." on the Hadoop file system,
# where the Pig scripts load them by relative path.
hadoop fs -copyFromLocal news20_scale.train .
hadoop fs -copyFromLocal news20_scale.test .
-- Put the Hivemall jar on this Pig session's classpath.
register hivemall-0.3-with-dependencies.jar
-- Bind Pig-callable names to Hive functions through the HiveUDAF / HiveUDF / HiveUDTF wrappers.
-- NOTE(review): collect_set is defined here but not referenced by any statement visible on this page.
define collect_set HiveUDAF('collect_set');
-- addBias is applied to the features bag when the test set is exploded.
define addBias HiveUDF('hivemall.ftvec.AddBiasUDF');
-- The second argument is the wrapper's constant-parameter string. Presumably it marks the
-- first amplify() argument (3) as a constant -- TODO confirm against the Pig HiveUDTF docs,
-- since amplify(3, *) is called with more than two arguments below.
define amplify HiveUDTF('hivemall.ftvec.amplify.AmplifierUDTF', '(3,null)');
-- Build 'news20mc_train_x3': the converted training set passed through
-- amplify(3, *) and then written out in random order.
-- Requires the amplify alias (HiveUDTF wrapper) to be defined beforehand.

-- Remove any previous output so the final store does not fail on an existing path.
rmf news20mc_train_x3
a = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
-- label is declared int here to match the load schema above (and the
-- one-vs-rest script); declaring it float would coerce the class ids to floats.
-- NOTE(review): amplify(3, *) presumably emits each input row 3 times -- confirm against Hivemall docs.
b = foreach a generate flatten(amplify(3, *)) as (rowid:int,label:int,features:{(featurepair:chararray)});
-- Shuffle: attach a random key, sort by it, then drop the key again.
c = foreach b generate RANDOM() as rand, rowid, label, features;
d = order c by rand;
e = foreach d generate rowid, label, features;
store e into 'news20mc_train_x3';
-- Build 'news20_onevsrest_train_x3': every training row is paired with every
-- distinct class label, tagged with target = 1 when the row's own label equals
-- the paired label and -1 otherwise, passed through amplify(3, *), and written
-- out in random order.
-- Requires the amplify alias (HiveUDTF wrapper) to be defined beforehand.
rmf news20_onevsrest_train_x3

-- Distinct set of class labels found in the training file.
label_source = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
label_column = foreach label_source generate label;
class_labels = distinct label_column;

-- Pair each training row with each class label.
train_rows = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
row_label_pairs = cross train_rows, class_labels;
one_vs_rest = foreach row_label_pairs generate
    train_rows::rowid as rowid,
    class_labels::label as label,
    (train_rows::label == class_labels::label ? 1 : -1) as target,
    train_rows::features as features:{(featurepair:chararray)};

amplified = foreach one_vs_rest generate flatten(amplify(3, *)) as (rowid:int,label:int,target:int,features:{(featurepair:chararray)});

-- Shuffle: attach a random key, sort by it, then drop the key again.
keyed = foreach amplified generate RANDOM() as rand, rowid, label, target, features;
shuffled = order keyed by rand;
result = foreach shuffled generate rowid, label, target, features;
store result into 'news20_onevsrest_train_x3';
-- Build 'news20mc_test_exploded': the converted test set with one output row
-- per (rowid, label, feature pair), where each feature pair string is split
-- on ':' into separate columns.
-- Requires the addBias alias (HiveUDF wrapper) to be defined beforehand.
rmf news20mc_test_exploded
test_rows = load 'news20_scale.test' as (rowid:int, label:int, features:{(featurepair:chararray)});
-- Run the features bag through addBias before exploding it.
with_bias = foreach test_rows generate rowid, label, addBias(features) as features;
-- One row per element of the features bag.
per_feature = foreach with_bias generate rowid, label, flatten(features) as featurepair:chararray;
-- Split each pair string on ':' and flatten the resulting tuple into columns.
exploded = foreach per_feature generate rowid, label, flatten(STRSPLIT(featurepair, ':'));
store exploded into 'news20mc_test_exploded';