Skip to content

Pig_news20 multiclass dataset

daijyc edited this page Mar 2, 2015 · 2 revisions

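Download the news20 multiclass dataset (news20.scale and news20.t.scale) from http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20 and convert it into a tab-separated format that Pig can load, using the following awk script (saved as conv.awk):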

# Rewrites each space-separated input line "label feat feat ..." as
#   <line number> TAB <label> TAB {(feat),(feat),...}
# so that the third column can be loaded by Pig as a bag of single-field tuples.
BEGIN { FS = " " }
{
    bag = ""
    for (col = 2; col <= NF; col++) {
        # Comma-separate every entry except the first one.
        bag = bag (col > 2 ? "," : "") "(" $col ")"
    }
    print NR "\t" $1 "\t" "{" bag "}"
}

# Run conv.awk over the training and test files; each output line is
# "<rowid> TAB <label> TAB {(feature),...}".
$ gawk -f conv.awk news20.scale > news20_scale.train
$ gawk -f conv.awk news20.t.scale > news20_scale.test

Putting data on HDFS

# Copy both converted files from the local filesystem to HDFS, using "." as
# the destination path.
hadoop fs -copyFromLocal news20_scale.train .
hadoop fs -copyFromLocal news20_scale.test .

Training/test data preparation

-- Make the Hivemall jar (with its dependencies) available to this Pig session.
register hivemall-0.3-with-dependencies.jar

-- Expose Hive functions under short Pig aliases via Pig's Hive wrappers.
define collect_set HiveUDAF('collect_set');
define addBias HiveUDF('hivemall.ftvec.AddBiasUDF');
-- The second constructor argument passes constant parameters to the UDTF.
-- NOTE(review): the 3 here presumably has to match the factor used at the
-- call sites (amplify(3, *)) -- confirm before changing either one.
define amplify HiveUDTF('hivemall.ftvec.amplify.AmplifierUDTF', '(3,null)');

-- Build the multiclass training set 'news20mc_train_x3':
-- amplify the converted training rows and store them in random order.

-- Remove any previous output so that the store below does not fail.
rmf news20mc_train_x3

a = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
-- amplify(3, *) is expected to emit each input row 3 times (hence "_x3").
-- The label is a class id, so it keeps the int type it was loaded with
-- (it was previously re-declared as float here, which disagreed with the
-- load schema and with the one-vs-rest pipeline).
b = foreach a generate flatten(amplify(3, *)) as (rowid:int,label:int,features:{(featurepair:chararray)});
-- Shuffle: attach a random key, sort by it, then drop the key again.
c = foreach b generate RANDOM() as rand, rowid, label, features;
d = order c by rand;
e = foreach d generate rowid, label, features;
store e into 'news20mc_train_x3';

-- Build the one-vs-rest training set 'news20_onevsrest_train_x3':
-- every training row is paired with every distinct class label and gets a
-- binary target (1 when the row belongs to that class, -1 otherwise).

-- Remove any previous output so that the store below does not fail.
rmf news20_onevsrest_train_x3

-- Distinct class labels found in the training data.
labeled = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
only_labels = foreach labeled generate label;
classes = distinct only_labels;

-- Pair each training row with each class label.
samples = load 'news20_scale.train' as (rowid:int, label:int, features:{(featurepair:chararray)});
paired = cross samples, classes;
targets = foreach paired generate
    samples::rowid as rowid,
    classes::label as label,
    (samples::label==classes::label?1:-1) as target,
    samples::features as features:{(featurepair:chararray)};

-- Amplify, then shuffle by sorting on a random key that is dropped afterwards.
amplified = foreach targets generate flatten(amplify(3, *)) as (rowid:int,label:int,target:int,features:{(featurepair:chararray)});
keyed = foreach amplified generate RANDOM() as rand, rowid, label, target, features;
shuffled = order keyed by rand;
result = foreach shuffled generate rowid, label, target, features;
store result into 'news20_onevsrest_train_x3';

-- Build the exploded test set 'news20mc_test_exploded':
-- one output row per (test row, feature), with each feature pair split on ':'.

-- Remove any previous output so that the store below does not fail.
rmf news20mc_test_exploded

test_rows = load 'news20_scale.test' as (rowid:int, label:int, features:{(featurepair:chararray)});
-- Pass the feature bag through Hivemall's AddBiasUDF.
with_bias = foreach test_rows generate rowid, label, addBias(features) as features;
-- Unnest the bag: one row per feature pair.
per_feature = foreach with_bias generate rowid, label, flatten(features) as featurepair:chararray;
-- Split each pair on ':' into separate columns.
exploded = foreach per_feature generate rowid, label, flatten(STRSPLIT(featurepair, ':'));
store exploded into 'news20mc_test_exploded';
Clone this wiki locally