Skip to content

Pig_news20 multiclass classification #2 (CW, AROW, SCW)

daijyc edited this page Mar 2, 2015 · 3 revisions
| Algorithm | Accuracy |
|-----------|----------|
| PA2 | 0.8201853243175558 |
| SCW1 | 0.8467317806160781 |
| AROW | 0.8437265214124718 |
| SCW2 | 0.8549962434259955 |
| CW | 0.8569997495617331 |

## CW

training

-- Train a multiclass model with the Confidence Weighted (CW) trainer and
-- store one averaged weight per (label, feature) in 'news20mc_model'.
register hivemall-0.3-with-dependencies.jar

-- Bind the Hivemall functions used below to short Pig names.
define addBias   HiveUDF('hivemall.ftvec.AddBiasUDF');
define voted_avg HiveUDAF('hivemall.ensemble.bagging.VotedAvgUDAF');

-- train_multiclass_cw
define train HiveUDTF('hivemall.classifier.multiclass.MulticlassConfidenceWeightedUDTF');

-- Remove any model output left over from a previous run.
rmf news20mc_model

train_rows = load 'news20mc_train_x3'
    as (rowid:int, label:int, features:{(featurepair:chararray)});
biased_rows = foreach train_rows generate
    rowid, label, addBias(features) as features:{(featurepair:chararray)};
-- The trainer emits (label, feature, weight, covar); covar is not used afterwards.
updates = foreach biased_rows generate
    flatten(train(features, label)) as (label, feature, weight, covar);
per_key = group updates by (label, feature);
-- Reduce all weights emitted for the same (label, feature) with voted_avg.
averaged = foreach per_key generate
    flatten(group) as (label, feature), voted_avg(updates.weight);
store averaged into 'news20mc_model';

prediction

-- Score every test row against the stored model and write, per rowid, the
-- (score, label) pair selected by maxrow to 'news20mc_predict'.
register hivemall-0.3-with-dependencies.jar

define maxrow HiveUDAF('hivemall.ensemble.MaxRowUDAF');

-- Remove any prediction output left over from a previous run.
rmf news20mc_predict

test_rows = load 'news20mc_test_exploded'
    as (rowid:int, label:int, feature:int, value:float);
model_rows = load 'news20mc_model'
    as (label:int, feature:int, weight:float);
-- Left outer join: test features without a model entry are kept, with a
-- null label and weight on the model side.
joined = join test_rows by feature LEFT OUTER, model_rows by feature;
contrib = foreach joined generate
    test_rows::rowid  as rowid,
    model_rows::label as label,
    model_rows::weight as weight,
    test_rows::value  as value;
by_row_label = group contrib by (rowid, label);
-- score(rowid, label) = SUM(weight * value) over the joined features.
scored = foreach by_row_label {
    products = foreach contrib generate weight * value;
    generate flatten(group) as (rowid, label), SUM(products) as score;
}
by_row = group scored by rowid;
-- maxrow presumably returns the (score, label) tuple with the largest score
-- for each rowid -- confirm against the Hivemall MaxRowUDAF documentation.
best = foreach by_row generate
    group as rowid,
    flatten(maxrow(scored.(score, label))) as (score:double, label:int);
store best into 'news20mc_predict';

evaluation

-- Accuracy = (rows whose predicted label equals the true label) / (rows in the test set).
truth = load 'news20_scale.test' as (rowid:int, label:int);
truth_all = group truth all;
total = foreach truth_all generate COUNT(truth) as count;

predicted = load 'news20mc_predict' as (rowid:int, score:float, label:int);
paired = join truth by rowid, predicted by rowid;
hits = filter paired by truth::label == predicted::label;
hits_all = group hits all;
-- total holds a single row, so total.count is read as a scalar here.
accuracy = foreach hits_all generate (double)COUNT(hits) / total.count;
dump accuracy;

0.8569997495617331


## AROW

training

-- Train a multiclass model with the AROW trainer and store one averaged
-- weight per (label, feature) in 'news20mc_model'.
register hivemall-0.3-with-dependencies.jar

-- Bind the Hivemall functions used below to short Pig names.
define addBias   HiveUDF('hivemall.ftvec.AddBiasUDF');
define voted_avg HiveUDAF('hivemall.ensemble.bagging.VotedAvgUDAF');

-- train_multiclass_arow
define train HiveUDTF('hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF');

-- Remove any model output left over from a previous run.
rmf news20mc_model

train_rows = load 'news20mc_train_x3'
    as (rowid:int, label:int, features:{(featurepair:chararray)});
biased_rows = foreach train_rows generate
    rowid, label, addBias(features) as features:{(featurepair:chararray)};
-- The trainer emits (label, feature, weight, covar); covar is not used afterwards.
updates = foreach biased_rows generate
    flatten(train(features, label)) as (label, feature, weight, covar);
per_key = group updates by (label, feature);
-- Reduce all weights emitted for the same (label, feature) with voted_avg.
averaged = foreach per_key generate
    flatten(group) as (label, feature), voted_avg(updates.weight);
store averaged into 'news20mc_model';

prediction

-- Score every test row against the stored model and write, per rowid, the
-- (score, label) pair selected by maxrow to 'news20mc_predict'.
register hivemall-0.3-with-dependencies.jar

define maxrow HiveUDAF('hivemall.ensemble.MaxRowUDAF');

-- Remove any prediction output left over from a previous run.
rmf news20mc_predict

test_rows = load 'news20mc_test_exploded'
    as (rowid:int, label:int, feature:int, value:float);
model_rows = load 'news20mc_model'
    as (label:int, feature:int, weight:float);
-- Left outer join: test features without a model entry are kept, with a
-- null label and weight on the model side.
joined = join test_rows by feature LEFT OUTER, model_rows by feature;
contrib = foreach joined generate
    test_rows::rowid  as rowid,
    model_rows::label as label,
    model_rows::weight as weight,
    test_rows::value  as value;
by_row_label = group contrib by (rowid, label);
-- score(rowid, label) = SUM(weight * value) over the joined features.
scored = foreach by_row_label {
    products = foreach contrib generate weight * value;
    generate flatten(group) as (rowid, label), SUM(products) as score;
}
by_row = group scored by rowid;
-- maxrow presumably returns the (score, label) tuple with the largest score
-- for each rowid -- confirm against the Hivemall MaxRowUDAF documentation.
best = foreach by_row generate
    group as rowid,
    flatten(maxrow(scored.(score, label))) as (score:double, label:int);
store best into 'news20mc_predict';

evaluation

-- Accuracy = (rows whose predicted label equals the true label) / (rows in the test set).
truth = load 'news20_scale.test' as (rowid:int, label:int);
truth_all = group truth all;
total = foreach truth_all generate COUNT(truth) as count;

predicted = load 'news20mc_predict' as (rowid:int, score:float, label:int);
paired = join truth by rowid, predicted by rowid;
hits = filter paired by truth::label == predicted::label;
hits_all = group hits all;
-- total holds a single row, so total.count is read as a scalar here.
accuracy = foreach hits_all generate (double)COUNT(hits) / total.count;
dump accuracy;

0.8437265214124718


## SCW1

training

-- Train a multiclass model with the Soft Confidence Weighted trainer (SCW1
-- variant) and store one averaged weight per (label, feature) in 'news20mc_model'.
register hivemall-0.3-with-dependencies.jar

-- Bind the Hivemall functions used below to short Pig names.
define addBias   HiveUDF('hivemall.ftvec.AddBiasUDF');
define voted_avg HiveUDAF('hivemall.ensemble.bagging.VotedAvgUDAF');

-- train_multiclass_scw
-- SCW1 is a nested class of the UDTF; the '$' in its name is backslash-escaped.
define train HiveUDTF('hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF\$SCW1');

-- Remove any model output left over from a previous run.
rmf news20mc_model

train_rows = load 'news20mc_train_x3'
    as (rowid:int, label:int, features:{(featurepair:chararray)});
biased_rows = foreach train_rows generate
    rowid, label, addBias(features) as features:{(featurepair:chararray)};
-- The trainer emits (label, feature, weight, covar); covar is not used afterwards.
updates = foreach biased_rows generate
    flatten(train(features, label)) as (label, feature, weight, covar);
per_key = group updates by (label, feature);
-- Reduce all weights emitted for the same (label, feature) with voted_avg.
averaged = foreach per_key generate
    flatten(group) as (label, feature), voted_avg(updates.weight);
store averaged into 'news20mc_model';

prediction

-- Score every test row against the stored model and write, per rowid, the
-- (score, label) pair selected by maxrow to 'news20mc_predict'.
register hivemall-0.3-with-dependencies.jar

define maxrow HiveUDAF('hivemall.ensemble.MaxRowUDAF');

-- Remove any prediction output left over from a previous run.
rmf news20mc_predict

test_rows = load 'news20mc_test_exploded'
    as (rowid:int, label:int, feature:int, value:float);
model_rows = load 'news20mc_model'
    as (label:int, feature:int, weight:float);
-- Left outer join: test features without a model entry are kept, with a
-- null label and weight on the model side.
joined = join test_rows by feature LEFT OUTER, model_rows by feature;
contrib = foreach joined generate
    test_rows::rowid  as rowid,
    model_rows::label as label,
    model_rows::weight as weight,
    test_rows::value  as value;
by_row_label = group contrib by (rowid, label);
-- score(rowid, label) = SUM(weight * value) over the joined features.
scored = foreach by_row_label {
    products = foreach contrib generate weight * value;
    generate flatten(group) as (rowid, label), SUM(products) as score;
}
by_row = group scored by rowid;
-- maxrow presumably returns the (score, label) tuple with the largest score
-- for each rowid -- confirm against the Hivemall MaxRowUDAF documentation.
best = foreach by_row generate
    group as rowid,
    flatten(maxrow(scored.(score, label))) as (score:double, label:int);
store best into 'news20mc_predict';

evaluation

-- Accuracy = (rows whose predicted label equals the true label) / (rows in the test set).
truth = load 'news20_scale.test' as (rowid:int, label:int);
truth_all = group truth all;
total = foreach truth_all generate COUNT(truth) as count;

predicted = load 'news20mc_predict' as (rowid:int, score:float, label:int);
paired = join truth by rowid, predicted by rowid;
hits = filter paired by truth::label == predicted::label;
hits_all = group hits all;
-- total holds a single row, so total.count is read as a scalar here.
accuracy = foreach hits_all generate (double)COUNT(hits) / total.count;
dump accuracy;

0.8467317806160781


## SCW2

training

-- Train a multiclass model with the Soft Confidence Weighted trainer (SCW2
-- variant) and store one averaged weight per (label, feature) in 'news20mc_model'.
register hivemall-0.3-with-dependencies.jar

-- Bind the Hivemall functions used below to short Pig names.
define addBias   HiveUDF('hivemall.ftvec.AddBiasUDF');
define voted_avg HiveUDAF('hivemall.ensemble.bagging.VotedAvgUDAF');

-- train_multiclass_scw2
-- SCW2 is a nested class of the UDTF; the '$' in its name is backslash-escaped.
define train HiveUDTF('hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF\$SCW2');

-- Remove any model output left over from a previous run.
rmf news20mc_model

train_rows = load 'news20mc_train_x3'
    as (rowid:int, label:int, features:{(featurepair:chararray)});
biased_rows = foreach train_rows generate
    rowid, label, addBias(features) as features:{(featurepair:chararray)};
-- The trainer emits (label, feature, weight, covar); covar is not used afterwards.
updates = foreach biased_rows generate
    flatten(train(features, label)) as (label, feature, weight, covar);
per_key = group updates by (label, feature);
-- Reduce all weights emitted for the same (label, feature) with voted_avg.
averaged = foreach per_key generate
    flatten(group) as (label, feature), voted_avg(updates.weight);
store averaged into 'news20mc_model';

prediction

-- Score every test row against the stored model and write, per rowid, the
-- (score, label) pair selected by maxrow to 'news20mc_predict'.
register hivemall-0.3-with-dependencies.jar

define maxrow HiveUDAF('hivemall.ensemble.MaxRowUDAF');

-- Remove any prediction output left over from a previous run.
rmf news20mc_predict

test_rows = load 'news20mc_test_exploded'
    as (rowid:int, label:int, feature:int, value:float);
model_rows = load 'news20mc_model'
    as (label:int, feature:int, weight:float);
-- Left outer join: test features without a model entry are kept, with a
-- null label and weight on the model side.
joined = join test_rows by feature LEFT OUTER, model_rows by feature;
contrib = foreach joined generate
    test_rows::rowid  as rowid,
    model_rows::label as label,
    model_rows::weight as weight,
    test_rows::value  as value;
by_row_label = group contrib by (rowid, label);
-- score(rowid, label) = SUM(weight * value) over the joined features.
scored = foreach by_row_label {
    products = foreach contrib generate weight * value;
    generate flatten(group) as (rowid, label), SUM(products) as score;
}
by_row = group scored by rowid;
-- maxrow presumably returns the (score, label) tuple with the largest score
-- for each rowid -- confirm against the Hivemall MaxRowUDAF documentation.
best = foreach by_row generate
    group as rowid,
    flatten(maxrow(scored.(score, label))) as (score:double, label:int);
store best into 'news20mc_predict';

evaluation

-- Accuracy = (rows whose predicted label equals the true label) / (rows in the test set).
truth = load 'news20_scale.test' as (rowid:int, label:int);
truth_all = group truth all;
total = foreach truth_all generate COUNT(truth) as count;

predicted = load 'news20mc_predict' as (rowid:int, score:float, label:int);
paired = join truth by rowid, predicted by rowid;
hits = filter paired by truth::label == predicted::label;
hits_all = group hits all;
-- total holds a single row, so total.count is read as a scalar here.
accuracy = foreach hits_all generate (double)COUNT(hits) / total.count;
dump accuracy;

0.8549962434259955

Clone this wiki locally