Skip to content

Commit

Permalink
Adds string length util for Enum columns.
Browse files Browse the repository at this point in the history
  • Loading branch information
bghill committed Sep 23, 2015
1 parent 0eb9ec0 commit de0b19c
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 2 deletions.
25 changes: 23 additions & 2 deletions h2o-core/src/main/java/water/rapids/ASTStrOp.java
Expand Up @@ -2,6 +2,7 @@

import org.apache.commons.lang.StringUtils;
import water.MRTask;
import water.MemoryManager;
import water.fvec.CStrChunk;
import water.fvec.Chunk;
import water.fvec.Frame;
Expand Down Expand Up @@ -274,8 +275,28 @@ class ASTStrLength extends ASTPrim {
return new ValFrame(new Frame(res));
}
/**
 * Builds a single-column Vec holding, for every row of a categorical (enum)
 * column, the character length of that row's level string.
 *
 * The length of each domain string is computed once per level and then looked
 * up per row, rather than measuring a string for every row. Rows that are NA
 * in the input are emitted as NA in the output.
 *
 * @param vec categorical column; its domain() supplies the level strings
 * @return the single output Vec produced by the task
 */
private Vec lengthEnumCol(Vec vec) {
  return new MRTask() {
    // Level index -> length of that level's string. Marked transient and
    // (re)populated in setupLocal() instead of being computed by the caller.
    transient int[] catLengths;

    @Override public void setupLocal() {
      // NOTE(review): presumably _fr wraps the vec handed to doAll(), so this
      // is the same domain as vec.domain() -- confirm against MRTask.
      String[] doms = _fr.anyVec().domain();
      catLengths = new int[doms.length];
      for (int i = 0; i < doms.length; ++i) {
        catLengths[i] = doms[i].length();
      }
    }

    @Override public void map(Chunk chk, NewChunk newChk) {
      // pre-allocate since the size is known
      newChk._ls = MemoryManager.malloc8(chk._len);
      newChk._xs = MemoryManager.malloc4(chk._len); // sadly, a waste
      for (int i = 0; i < chk._len; i++) {
        if (chk.isNA(i)) {
          newChk.addNA();
        } else {
          // The stored value is used as an index into the per-level table.
          newChk.addNum(catLengths[(int) chk.atd(i)], 0);
        }
      }
    }
  }.doAll(1, vec).outputFrame().anyVec();
}

private Vec lengthStringCol(Vec vec) {
Expand Down
18 changes: 18 additions & 0 deletions h2o-py/tests/testdir_munging/pyunit_length.py
Expand Up @@ -5,6 +5,24 @@
def length_check():
# Connect to a pre-existing cluster

# Test on enums
frame = h2o.import_file(path=h2o.locate("smalldata/junit/cars_trim.csv"))

# single column (frame)
length_frame = frame["name"].length()
assert length_frame[0,0] == 26, "Expected 26, but got {}".format(length_frame[0,0])
assert length_frame[1,0] == 19, "Expected 19, but got {}".format(length_frame[1,0])
assert length_frame[2,0] == 19, "Expected 19, but got {}".format(length_frame[2,0])

# single column (vec)
vec = frame["name"]
trimmed_vec = vec.trim()
length_vec = trimmed_vec.length()
assert length_vec[0,0] == 23, "Expected 23, but got {}".format(length_vec[0,0])
assert length_vec[1,0] == 18, "Expected 18, but got {}".format(length_vec[1,0])
assert length_vec[2,0] == 18, "Expected 18, but got {}".format(length_vec[2,0])

# Test on strings
frame = h2o.import_file(path=h2o.locate("smalldata/junit/cars_trim.csv"), col_types=["string","numeric","numeric","numeric","numeric","numeric","numeric","numeric"])

# single column (frame)
Expand Down

0 comments on commit de0b19c

Please sign in to comment.