Skip to content
Makoto YUI edited this page Oct 18, 2013 · 5 revisions

a9a

http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a


preparation

conv.awk

# Work from the directory that holds the raw a9a files and conv.awk.
cd /mnt/archive/datasets/classification/a9a

# Run each raw file through conv.awk, then rewrite the first "+1" on every
# line to "1" and the first "-1" to "0" in a single sed pass.
awk -f conv.awk a9a   | sed -e "s/+1/1/" -e "s/-1/0/" > a9a.train
awk -f conv.awk a9a.t | sed -e "s/+1/1/" -e "s/-1/0/" > a9a.test

Putting data on HDFS

# Create the target directories. -p also creates the missing parent
# directories (/dataset and /dataset/a9a); without it, -mkdir fails on
# Hadoop 2+ when the parents do not exist yet.
hadoop fs -mkdir -p /dataset/a9a/train
hadoop fs -mkdir -p /dataset/a9a/test

# Upload the converted training and test files into their directories.
hadoop fs -copyFromLocal a9a.train /dataset/a9a/train
hadoop fs -copyFromLocal a9a.test /dataset/a9a/test

Training/test data preparation

-- Create the working database only when it is missing, so that this setup
-- can be re-run without failing on "database already exists".
CREATE DATABASE IF NOT EXISTS a9a;
USE a9a;

-- Drop any previously registered copy of the jar before adding it again.
delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;

-- Execute the companion Hive script.
-- NOTE(review): presumably this registers the functions used later
-- (e.g. addBias); confirm against define-all.hive.
source /home/myui/tmp/define-all.hive;

-- External table over the training file uploaded to /dataset/a9a/train.
-- Columns are tab-separated; the items of the features array are
-- comma-separated.
CREATE EXTERNAL TABLE a9atrain (
  rowid    INT,
  label    FLOAT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/a9a/train';

-- External table over the test file uploaded to /dataset/a9a/test.
-- Columns are tab-separated; the items of the features array are
-- comma-separated.
CREATE EXTERNAL TABLE a9atest (
  rowid    INT,
  label    FLOAT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/a9a/test';

-- Flatten the training set to one row per element of addBias(features).
-- Each element is split on ":"; the part before the colon becomes the
-- integer feature column and the part after it the float value column.
CREATE TABLE a9atrain_exploded AS
SELECT
  rowid,
  label,
  CAST(SPLIT(fv, ":")[0] AS INT)   AS feature,
  CAST(SPLIT(fv, ":")[1] AS FLOAT) AS value
FROM a9atrain
LATERAL VIEW EXPLODE(addBias(features)) exploded AS fv;

-- Flatten the test set to one row per element of addBias(features).
-- Each element is split on ":"; the part before the colon becomes the
-- integer feature column and the part after it the float value column.
CREATE TABLE a9atest_exploded AS
SELECT
  rowid,
  label,
  CAST(SPLIT(fv, ":")[0] AS INT)   AS feature,
  CAST(SPLIT(fv, ":")[1] AS FLOAT) AS value
FROM a9atest
LATERAL VIEW EXPLODE(addBias(features)) exploded AS fv;
Clone this wiki locally