news20 binary classification #2 (CW, AROW, SCW)

UDF preparation

-- Work inside the news20 database.
use news20;

-- Unregister and then re-register the hivemall jar for this session
-- (presumably so that a rebuilt jar at the same path is picked up -- confirm).
delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;
-- Run the Hive script located in the same directory as the jar
-- (by its name it presumably registers the hivemall functions -- adjust both paths to your install).
source /home/myui/tmp/define-all.hive;

Confidence Weighted (CW)

training

-- Rebuild the CW model table with one weight per feature.
-- IF EXISTS keeps the drop a no-op on a first run, when the table is not there yet.
drop table if exists news20b_cw_model1;
create table news20b_cw_model1 as
select
 feature,
 -- voted_avg aggregates every weight emitted for a feature (the voting rule is defined by hivemall)
 cast(voted_avg(weight) as float) as weight
from
 (select
     -- train_cw(addBias(features), label) as (feature, weight) -- [hivemall v0.1]
     -- covar is emitted by v0.2 or later but is not used by the outer query
     train_cw(addBias(features), label) as (feature, weight, covar) -- [hivemall v0.2 or later]
  from
     news20b_train_x3
 ) t
group by feature;

prediction

-- Score every test row against the CW model and derive a +1/-1 label from the sign of the score.
create or replace view news20b_cw_predict1 as
select
  scored.rowid,
  scored.total_weight,
  case when scored.total_weight > 0.0 then 1 else -1 end as label
from (
  -- Sum of weight * value over the features of each test row.
  -- The outer join keeps test features that have no model weight (they add nothing to the sum).
  -- A row with no matching feature at all gets a NULL score and therefore label -1.
  select
    te.rowid,
    sum(mdl.weight * te.value) as total_weight
  from
    news20b_test_exploded te
    LEFT OUTER JOIN news20b_cw_model1 mdl ON (te.feature = mdl.feature)
  group by
    te.rowid
) scored;

evaluation

-- Pair the true label of each test row with the CW prediction for the same rowid.
create or replace view news20b_cw_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  news20b_test tst
  JOIN news20b_cw_predict1 pred on (tst.rowid = pred.rowid);

-- Accuracy: number of correctly predicted rows divided by 4996
-- (presumably the row count of news20b_test -- confirm before reusing on other data).
select count(1) / 4996
from news20b_cw_submit1
where predicted = actual;

0.9655724579663731

Cleaning

-- Remove the CW artifacts. Views go first because they read from the model table.
-- IF EXISTS lets the cleanup be re-run without errors.
drop view if exists news20b_cw_submit1;
drop view if exists news20b_cw_predict1;
drop table if exists news20b_cw_model1;

Adaptive Regularization of Weight Vectors (AROW)

training

-- Rebuild the AROW model table with one weight per feature.
-- IF EXISTS keeps the drop a no-op on a first run, when the table is not there yet.
drop table if exists news20b_arow_model1;
create table news20b_arow_model1 as
select
 feature,
 -- voted_avg aggregates every weight emitted for a feature (the voting rule is defined by hivemall)
 cast(voted_avg(weight) as float) as weight
from
 (select
     -- train_arow(addBias(features),label) as (feature,weight) -- [hivemall v0.1]
     -- covar is emitted by v0.2 or later but is not used by the outer query
     train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later]
  from
     news20b_train_x3
 ) t
group by feature;

prediction

-- Score every test row against the AROW model and derive a +1/-1 label from the sign of the score.
create or replace view news20b_arow_predict1 as
select
  scored.rowid,
  scored.total_weight,
  case when scored.total_weight > 0.0 then 1 else -1 end as label
from (
  -- Sum of weight * value over the features of each test row.
  -- The outer join keeps test features that have no model weight (they add nothing to the sum).
  -- A row with no matching feature at all gets a NULL score and therefore label -1.
  select
    te.rowid,
    sum(mdl.weight * te.value) as total_weight
  from
    news20b_test_exploded te
    LEFT OUTER JOIN news20b_arow_model1 mdl ON (te.feature = mdl.feature)
  group by
    te.rowid
) scored;

evaluation

-- Pair the true label of each test row with the AROW prediction for the same rowid.
create or replace view news20b_arow_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  news20b_test tst
  JOIN news20b_arow_predict1 pred on (tst.rowid = pred.rowid);

-- Accuracy: number of correctly predicted rows divided by 4996
-- (presumably the row count of news20b_test -- confirm before reusing on other data).
select count(1) / 4996
from news20b_arow_submit1
where predicted = actual;

0.9659727782225781

Cleaning

-- Remove the AROW artifacts. Views go first because they read from the model table.
-- IF EXISTS lets the cleanup be re-run without errors.
drop view if exists news20b_arow_submit1;
drop view if exists news20b_arow_predict1;
drop table if exists news20b_arow_model1;

Soft Confidence-Weighted (SCW1)

training

-- Rebuild the SCW1 model table with one weight per feature.
-- IF EXISTS keeps the drop a no-op on a first run, when the table is not there yet.
drop table if exists news20b_scw_model1;
create table news20b_scw_model1 as
select
 feature,
 -- voted_avg aggregates every weight emitted for a feature (the voting rule is defined by hivemall)
 cast(voted_avg(weight) as float) as weight
from
 (select
     -- train_scw(addBias(features),label) as (feature,weight) -- [hivemall v0.1]
     -- covar is emitted by v0.2 or later but is not used by the outer query
     train_scw(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later]
  from
     news20b_train_x3
 ) t
group by feature;

prediction

-- Score every test row against the SCW1 model and derive a +1/-1 label from the sign of the score.
create or replace view news20b_scw_predict1 as
select
  scored.rowid,
  scored.total_weight,
  case when scored.total_weight > 0.0 then 1 else -1 end as label
from (
  -- Sum of weight * value over the features of each test row.
  -- The outer join keeps test features that have no model weight (they add nothing to the sum).
  -- A row with no matching feature at all gets a NULL score and therefore label -1.
  select
    te.rowid,
    sum(mdl.weight * te.value) as total_weight
  from
    news20b_test_exploded te
    LEFT OUTER JOIN news20b_scw_model1 mdl ON (te.feature = mdl.feature)
  group by
    te.rowid
) scored;

evaluation

-- Pair the true label of each test row with the SCW1 prediction for the same rowid.
create or replace view news20b_scw_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  news20b_test tst
  JOIN news20b_scw_predict1 pred on (tst.rowid = pred.rowid);

-- Accuracy: number of correctly predicted rows divided by 4996
-- (presumably the row count of news20b_test -- confirm before reusing on other data).
select count(1) / 4996
from news20b_scw_submit1
where predicted = actual;

0.9661729383506805

Cleaning

-- Remove the SCW1 artifacts. Views go first because they read from the model table.
-- IF EXISTS lets the cleanup be re-run without errors.
drop view if exists news20b_scw_submit1;
drop view if exists news20b_scw_predict1;
drop table if exists news20b_scw_model1;

Soft Confidence-Weighted (SCW2)

training

-- Rebuild the SCW2 model table with one weight per feature.
-- IF EXISTS keeps the drop a no-op on a first run, when the table is not there yet.
drop table if exists news20b_scw2_model1;
create table news20b_scw2_model1 as
select
 feature,
 -- voted_avg aggregates every weight emitted for a feature (the voting rule is defined by hivemall)
 cast(voted_avg(weight) as float) as weight
from
 (select
     -- train_scw2(addBias(features),label) as (feature,weight)    -- [hivemall v0.1]
     -- covar is emitted by v0.2 or later but is not used by the outer query
     train_scw2(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later]
  from
     news20b_train_x3
 ) t
group by feature;

prediction

-- Score every test row against the SCW2 model and derive a +1/-1 label from the sign of the score.
create or replace view news20b_scw2_predict1 as
select
  scored.rowid,
  scored.total_weight,
  case when scored.total_weight > 0.0 then 1 else -1 end as label
from (
  -- Sum of weight * value over the features of each test row.
  -- The outer join keeps test features that have no model weight (they add nothing to the sum).
  -- A row with no matching feature at all gets a NULL score and therefore label -1.
  select
    te.rowid,
    sum(mdl.weight * te.value) as total_weight
  from
    news20b_test_exploded te
    LEFT OUTER JOIN news20b_scw2_model1 mdl ON (te.feature = mdl.feature)
  group by
    te.rowid
) scored;

evaluation

-- Pair the true label of each test row with the SCW2 prediction for the same rowid.
create or replace view news20b_scw2_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  news20b_test tst
  JOIN news20b_scw2_predict1 pred on (tst.rowid = pred.rowid);

-- Accuracy: number of correctly predicted rows divided by 4996
-- (presumably the row count of news20b_test -- confirm before reusing on other data).
select count(1) / 4996
from news20b_scw2_submit1
where predicted = actual;

0.9579663730984788

Cleaning

-- Remove the SCW2 artifacts. Views go first because they read from the model table.
-- IF EXISTS lets the cleanup be re-run without errors.
drop view if exists news20b_scw2_submit1;
drop view if exists news20b_scw2_predict1;
drop table if exists news20b_scw2_model1;

--

Algorithm	Accuracy
Perceptron	0.9459567654123299
SCW2	0.9579663730984788
PA2	0.9597678142514011
PA1	0.9601681345076061
PA	0.9603682946357086
CW	0.9655724579663731
AROW	0.9659727782225781
SCW1	0.9661729383506805

My recommendation is AROW for classification.

Analytics

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

news20 binary classification #2 (CW, AROW, SCW)

UDF preparation

Confidence Weighted (CW)

training

prediction

evaluation

Cleaning

Adaptive Regularization of Weight Vectors (AROW)

training

prediction

evaluation

Cleaning

Soft Confidence-Weighted (SCW1)

training

prediction

evaluation

Cleaning

Soft Confidence-Weighted (SCW2)

training

prediction

evaluation

Cleaning

Clone this wiki locally