Skip to content

Pig_webspam binary classification

daijyc edited this page Feb 28, 2015 · 4 revisions

PA1

-- Train a Passive Aggressive (PA1) classifier on 'webspam_train_x3' and
-- store one combined weight per feature in 'webspam_model'.
REGISTER hivemall-0.3-with-dependencies.jar

-- Aggregate used below to combine the weights emitted for the same feature.
DEFINE voted_avg HiveUDAF('hivemall.ensemble.bagging.VotedAvgUDAF');

--train_pa1
-- The backslash before the dollar sign keeps Pig from treating the nested
-- class name as a script parameter.
DEFINE train HiveUDTF('hivemall.classifier.PassiveAggressiveUDTF\$PA1');

-- Remove the output of any previous run before writing a new model.
rmf webspam_model

train_rows = LOAD 'webspam_train_x3'
    AS (rowid:int, label:int, features:{(featurepair:chararray)});

-- Re-project the columns, restating the bag schema of 'features'.
typed_rows = FOREACH train_rows GENERATE
    rowid,
    label,
    features AS features:{(featurepair:chararray)};

-- The trainer emits (feature, weight) tuples; FLATTEN turns them into rows.
updates = FOREACH typed_rows GENERATE
    FLATTEN(train(features, label)) AS (feature, weight);

per_feature = GROUP updates BY feature;
model = FOREACH per_feature GENERATE
    group AS feature,
    voted_avg(updates.weight);

STORE model INTO 'webspam_model';
-- Score each row of 'webspam_test_exploded' with the weights stored in
-- 'webspam_model' and write (rowid, sum, label) to 'webspam_predict'.
REGISTER hivemall-0.3-with-dependencies.jar

-- Remove the output of any previous run before writing new predictions.
rmf webspam_predict

test_rows = LOAD 'webspam_test_exploded'
    AS (rowid:int, label:int, feature:int, value:float);
weights = LOAD 'webspam_model' AS (feature:int, weight:float);

-- LEFT OUTER keeps test features that have no matching entry in the model.
joined = JOIN test_rows BY feature LEFT OUTER, weights BY feature;
terms = FOREACH joined GENERATE
    test_rows::rowid AS rowid,
    weights::weight AS weight,
    test_rows::value AS value;

-- Per row: sum of weight * value over all of its features.
by_row = GROUP terms BY rowid;
totals = FOREACH by_row {
    products = FOREACH terms GENERATE weight * value;
    GENERATE group AS rowid, SUM(products) AS sum;
}

-- label is 1 when the sum is greater than 0.0, else -1.
predicted = FOREACH totals GENERATE
    rowid,
    sum,
    CASE WHEN sum > 0.0 THEN 1 ELSE -1 END AS label;

STORE predicted INTO 'webspam_predict';
-- Print the prediction accuracy: rows whose predicted label equals the
-- label in 'webspam_test', divided by the row count of 'webspam_test'.
truth = LOAD 'webspam_test' AS (rowid:int, label:int);
truth_all = GROUP truth ALL;
test_size = FOREACH truth_all GENERATE COUNT(truth) AS count;

pred = LOAD 'webspam_predict'
    AS (rowid:int, total_weight:float, label:int);
paired = JOIN truth BY rowid, pred BY rowid;
hits = FILTER paired BY truth::label == pred::label;
hits_all = GROUP hits ALL;

-- test_size.count reads the single value computed above as a scalar.
accuracy = FOREACH hits_all GENERATE (double)COUNT(hits) / test_size.count;
DUMP accuracy;

Prediction accuracy: 0.9397321428571429

AROW

-- Train an AROW classifier on 'webspam_train_x3' and store one combined
-- weight per feature in 'webspam_model'.
REGISTER hivemall-0.3-with-dependencies.jar

-- Aggregate used below to combine the (weight, covar) pairs emitted for the
-- same feature.
DEFINE argmin_kld HiveUDAF('hivemall.ensemble.ArgminKLDistanceUDAF');

--train_arow
DEFINE train HiveUDTF('hivemall.classifier.AROWClassifierUDTF');

-- Remove the output of any previous run before writing a new model.
rmf webspam_model

train_rows = LOAD 'webspam_train_x3'
    AS (rowid:int, label:int, features:{(featurepair:chararray)});

-- Re-project the columns, restating the bag schema of 'features'.
typed_rows = FOREACH train_rows GENERATE
    rowid,
    label,
    features AS features:{(featurepair:chararray)};

-- The trainer emits (feature, weight, covar) tuples; FLATTEN turns them
-- into rows.
updates = FOREACH typed_rows GENERATE
    FLATTEN(train(features, label)) AS (feature, weight, covar);

per_feature = GROUP updates BY feature;
model = FOREACH per_feature GENERATE
    group AS feature,
    argmin_kld(updates.(weight, covar));

STORE model INTO 'webspam_model';
-- Score each row of 'webspam_test_exploded' with the weights stored in
-- 'webspam_model' and write (rowid, sum, label) to 'webspam_predict'.
REGISTER hivemall-0.3-with-dependencies.jar

-- Remove the output of any previous run before writing new predictions.
rmf webspam_predict

test_rows = LOAD 'webspam_test_exploded'
    AS (rowid:int, label:int, feature:int, value:float);
weights = LOAD 'webspam_model' AS (feature:int, weight:float);

-- LEFT OUTER keeps test features that have no matching entry in the model.
joined = JOIN test_rows BY feature LEFT OUTER, weights BY feature;
terms = FOREACH joined GENERATE
    test_rows::rowid AS rowid,
    weights::weight AS weight,
    test_rows::value AS value;

-- Per row: sum of weight * value over all of its features.
by_row = GROUP terms BY rowid;
totals = FOREACH by_row {
    products = FOREACH terms GENERATE weight * value;
    GENERATE group AS rowid, SUM(products) AS sum;
}

-- label is 1 when the sum is greater than 0.0, else -1.
predicted = FOREACH totals GENERATE
    rowid,
    sum,
    CASE WHEN sum > 0.0 THEN 1 ELSE -1 END AS label;

STORE predicted INTO 'webspam_predict';
-- Print the prediction accuracy: rows whose predicted label equals the
-- label in 'webspam_test', divided by the row count of 'webspam_test'.
truth = LOAD 'webspam_test' AS (rowid:int, label:int);
truth_all = GROUP truth ALL;
test_size = FOREACH truth_all GENERATE COUNT(truth) AS count;

pred = LOAD 'webspam_predict'
    AS (rowid:int, total_weight:float, label:int);
paired = JOIN truth BY rowid, pred BY rowid;
hits = FILTER paired BY truth::label == pred::label;
hits_all = GROUP hits ALL;

-- test_size.count reads the single value computed above as a scalar.
accuracy = FOREACH hits_all GENERATE (double)COUNT(hits) / test_size.count;
DUMP accuracy;

Prediction accuracy: 0.9591321428571429

SCW1

-- Train an SCW1 classifier on 'webspam_train_x3' and store one combined
-- weight per feature in 'webspam_model'.
REGISTER hivemall-0.3-with-dependencies.jar

-- Aggregate used below to combine the (weight, covar) pairs emitted for the
-- same feature.
DEFINE argmin_kld HiveUDAF('hivemall.ensemble.ArgminKLDistanceUDAF');

--train_scw
-- The class name is reproduced exactly as written on this page; the
-- backslash before the dollar sign keeps Pig from treating the nested class
-- name as a script parameter.
DEFINE train HiveUDTF('hivemall.classifier.SoftConfideceWeightedUDTF\$SCW1');

-- Remove the output of any previous run before writing a new model.
rmf webspam_model

train_rows = LOAD 'webspam_train_x3'
    AS (rowid:int, label:int, features:{(featurepair:chararray)});

-- Re-project the columns, restating the bag schema of 'features'.
typed_rows = FOREACH train_rows GENERATE
    rowid,
    label,
    features AS features:{(featurepair:chararray)};

-- The trainer emits (feature, weight, covar) tuples; FLATTEN turns them
-- into rows.
updates = FOREACH typed_rows GENERATE
    FLATTEN(train(features, label)) AS (feature, weight, covar);

per_feature = GROUP updates BY feature;
model = FOREACH per_feature GENERATE
    group AS feature,
    argmin_kld(updates.(weight, covar));

STORE model INTO 'webspam_model';
-- Score each row of 'webspam_test_exploded' with the weights stored in
-- 'webspam_model' and write (rowid, sum, label) to 'webspam_predict'.
REGISTER hivemall-0.3-with-dependencies.jar

-- Remove the output of any previous run before writing new predictions.
rmf webspam_predict

test_rows = LOAD 'webspam_test_exploded'
    AS (rowid:int, label:int, feature:int, value:float);
weights = LOAD 'webspam_model' AS (feature:int, weight:float);

-- LEFT OUTER keeps test features that have no matching entry in the model.
joined = JOIN test_rows BY feature LEFT OUTER, weights BY feature;
terms = FOREACH joined GENERATE
    test_rows::rowid AS rowid,
    weights::weight AS weight,
    test_rows::value AS value;

-- Per row: sum of weight * value over all of its features.
by_row = GROUP terms BY rowid;
totals = FOREACH by_row {
    products = FOREACH terms GENERATE weight * value;
    GENERATE group AS rowid, SUM(products) AS sum;
}

-- label is 1 when the sum is greater than 0.0, else -1.
predicted = FOREACH totals GENERATE
    rowid,
    sum,
    CASE WHEN sum > 0.0 THEN 1 ELSE -1 END AS label;

STORE predicted INTO 'webspam_predict';
-- Print the prediction accuracy: rows whose predicted label equals the
-- label in 'webspam_test', divided by the row count of 'webspam_test'.
truth = LOAD 'webspam_test' AS (rowid:int, label:int);
truth_all = GROUP truth ALL;
test_size = FOREACH truth_all GENERATE COUNT(truth) AS count;

pred = LOAD 'webspam_predict'
    AS (rowid:int, total_weight:float, label:int);
paired = JOIN truth BY rowid, pred BY rowid;
hits = FILTER paired BY truth::label == pred::label;
hits_all = GROUP hits ALL;

-- test_size.count reads the single value computed above as a scalar.
accuracy = FOREACH hits_all GENERATE (double)COUNT(hits) / test_size.count;
DUMP accuracy;

Prediction accuracy: 0.9568928571428571

Clone this wiki locally