Skip to content

Pig_E2006 tfidf regression dataset

daijyc edited this page Mar 2, 2015 · 3 revisions

Dataset source (LIBSVM regression datasets, E2006-tfidf): http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf

Prerequisite

Data preparation

# Run conv_pig.awk over both E2006 splits, writing one .tsv file per split
# (E2006.train.tsv and E2006.test.tsv) for the Pig scripts to load.
for split in train test; do
  awk -f conv_pig.awk "E2006.${split}" > "E2006.${split}.tsv"
done
-- Make the Hivemall classes available to this Pig session.
REGISTER hivemall-0.3-with-dependencies.jar

-- Wrap the Hivemall Hive functions so they can be called from Pig.
-- addBias: Hive UDF hivemall.ftvec.AddBiasUDF.
DEFINE addBias HiveUDF('hivemall.ftvec.AddBiasUDF');
-- amplify: Hive UDTF hivemall.ftvec.amplify.AmplifierUDTF; the second argument
-- is the constant-parameter spec (first argument fixed to 3, the rest unset).
DEFINE amplify HiveUDTF('hivemall.ftvec.amplify.AmplifierUDTF', '(3,null,null)');

-- Build the training set: amplify, shuffle, and store as 'e2006tfidf_train_x3'.

-- Remove any previous output so STORE does not fail on an existing path.
rmf e2006tfidf_train_x3

-- Each row: a row id, the regression target, and a bag of feature-pair strings.
train_rows = LOAD 'E2006.train.tsv' AS (rowid:int, target:float, features:{(featurepair:chararray)});

-- Pass every row through amplify with factor 3.
-- NOTE(review): presumably emits each input row 3 times (hence the _x3 suffix);
-- confirm against the Hivemall amplifier documentation.
amplified = FOREACH train_rows GENERATE FLATTEN(amplify(3, *)) AS (rowid:int,target:float,features:{(featurepair:chararray)});

-- Shuffle: attach a random sort key, order by it, then drop the key again.
keyed = FOREACH amplified GENERATE RANDOM() AS rand, rowid, target, features;
shuffled = ORDER keyed BY rand;
train_x3 = FOREACH shuffled GENERATE rowid, target, features;

STORE train_x3 INTO 'e2006tfidf_train_x3';

-- Build the test set with one output row per (rowid, target, feature pair),
-- stored as 'e2006tfidf_test_exploded'.

-- Remove any previous output so STORE does not fail on an existing path.
rmf e2006tfidf_test_exploded

-- Each row: a row id, the regression target, and a bag of feature-pair strings.
test_rows = LOAD 'E2006.test.tsv' AS (rowid:int, target:float, features:{(featurepair:chararray)});

-- Run the feature bag through addBias before exploding it.
with_bias = FOREACH test_rows GENERATE rowid, target, addBias(features) AS features;

-- Flatten the bag: one row per feature-pair string.
per_feature = FOREACH with_bias GENERATE rowid, target, FLATTEN(features) AS featurepair:chararray;

-- Split each pair string on ':' and flatten the resulting tuple into columns.
split_pairs = FOREACH per_feature GENERATE rowid, target, FLATTEN(STRSPLIT(featurepair, ':'));

STORE split_pairs INTO 'e2006tfidf_test_exploded';
Clone this wiki locally