Skip to content

Pig_news20 multiclass classification #1 (PA)

daijyc edited this page Mar 2, 2015 · 4 revisions

# Passive Aggressive (PA2)

Training

model building

register hivemall-0.3-with-dependencies.jar

-- Bind the Hivemall Hive functions used below to short Pig names.
define addBias HiveUDF('hivemall.ftvec.AddBiasUDF');
define voted_avg HiveUDAF('hivemall.ensemble.bagging.VotedAvgUDAF');

-- train_multiclass_pa2 (PA2 variant of the multiclass Passive Aggressive trainer)
define train HiveUDTF('hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF\$PA2');

-- Clear any output left over from a previous run of this script.
rmf news20mc_model

-- Each training row carries a row id, a class label and a bag of feature strings.
training_rows = load 'news20mc_train_x3'
    as (rowid:int, label:int, features:{(featurepair:chararray)});

-- Pass the feature bag through addBias before training.
biased_rows = foreach training_rows generate
    rowid,
    label,
    addBias(features) as features:{(featurepair:chararray)};

-- The trainer emits (label, feature, weight) tuples; flatten them into rows.
updates = foreach biased_rows generate
    flatten(train(features, label)) as (label, feature, weight);

-- Combine all weights emitted for the same (label, feature) pair with voted_avg.
by_label_feature = group updates by (label, feature);
averaged = foreach by_label_feature generate
    flatten(group) as (label, feature),
    voted_avg(updates.weight);

store averaged into 'news20mc_model';

prediction

register hivemall-0.3-with-dependencies.jar

define maxrow HiveUDAF('hivemall.ensemble.MaxRowUDAF');

-- Clear any output left over from a previous run of this script.
rmf news20mc_predict

-- Test data with one row per (rowid, feature), and the model written by the training script.
test_rows = load 'news20mc_test_exploded'
    as (rowid:int, label:int, feature:int, value:float);
model_weights = load 'news20mc_model'
    as (label:int, feature:int, weight:float);

-- Left outer join keeps test rows whose feature has no entry in the model;
-- label and weight are null for those rows.
joined = join test_rows by feature LEFT OUTER, model_weights by feature;

-- Keep the model's label (the candidate class), not the test row's own label.
pairs = foreach joined generate
    test_rows::rowid as rowid,
    model_weights::label as label,
    model_weights::weight as weight,
    test_rows::value as value;

-- Score of a candidate label for a row = sum of weight * value over its features.
by_row_label = group pairs by (rowid, label);
label_scores = foreach by_row_label {
    products = foreach pairs generate weight * value;
    generate flatten(group) as (rowid, label), SUM(products) as score;
}

-- Per row, maxrow reduces the (score, label) tuples to a single one.
-- NOTE(review): presumably the tuple with the highest score - confirm against MaxRowUDAF.
by_row = group label_scores by rowid;
best = foreach by_row generate
    group as rowid,
    flatten(maxrow(label_scores.(score, label))) as (score:double, label:int);

store best into 'news20mc_predict';

evaluation

-- Ground-truth labels; the total row count is the accuracy denominator.
truth = load 'news20_scale.test' as (rowid:int, label:int);
truth_all = group truth all;
total = foreach truth_all generate COUNT(truth) as count;

-- Predictions written by the prediction script.
-- NOTE(review): score is stored there as double but read here as float; it is not used below.
predicted = load 'news20mc_predict' as (rowid:int, score:float, label:int);

-- Keep only the rows whose predicted label equals the true label.
paired = join truth by rowid, predicted by rowid;
hits = filter paired by truth::label == predicted::label;
hits_all = group hits all;

-- Accuracy = correct predictions / total rows; total.count reads the single-row relation as a scalar.
accuracy = foreach hits_all generate (double)COUNT(hits) / total.count;
dump accuracy;

Accuracy: 0.8201853243175558 (model trained on news20mc_train_x3)

Clone this wiki locally