Skip to content

Commit

Permalink
PUBDEV-1559: Add "Few Unique Doubles" chunk type and JUnit test.
Browse files Browse the repository at this point in the history
  • Loading branch information
arnocandel committed Aug 27, 2015
1 parent b4c4a4e commit 5ef3008
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 7 deletions.
80 changes: 80 additions & 0 deletions h2o-core/src/main/java/water/fvec/CUDChunk.java
@@ -0,0 +1,80 @@
package water.fvec;

import water.AutoBuffer;
import water.MemoryManager;
import water.util.UnsafeUtils;
import java.util.HashSet;

/**
 * The "few unique doubles"-compression function.
 *
 * <p>Layout of {@code _mem}, as written by the constructor:
 * <pre>
 *   [0, 8*numUniques)                  table of the unique double values
 *   [8*numUniques, 8*numUniques + len) one byte per row: index into the table,
 *                                      stored as an unsigned value 0..255
 *   last 4 bytes                       numUniques (int)
 * </pre>
 * The trailer sits at the very end so that {@link #read_impl} can recover both
 * {@code numUniques} and the row count from the raw byte array alone.
 */
public class CUDChunk extends Chunk {
  /** Largest table size whose indices still fit in one unsigned byte. */
  public static int MAX_UNIQUES=256;

  /**
   * Number of bytes needed to store a chunk.
   *
   * @param uniques number of distinct double values
   * @param len     number of rows
   * @return total size in bytes of the compressed representation
   */
  public static int computeByteSize(int uniques, int len) {
    return (uniques << 3) // unique double values, 8 bytes each
         + len            // mapping of row -> unique value index (0...255), 1 byte each
         + 4;             // numUniques trailer
  }

  int numUniques;

  /**
   * Builds the compressed chunk.
   *
   * @param bs  raw values, read as one 8-byte double per row at offset {@code row << 3}
   * @param hs  the set of distinct values occurring in {@code bs}
   * @param len number of rows
   */
  CUDChunk(byte[] bs, HashSet<Double> hs, int len) {
    numUniques = hs.size();
    assert numUniques <= MAX_UNIQUES : "too many uniques for a 1-byte index: " + numUniques;
    set_len(len);
    _mem = MemoryManager.malloc1(computeByteSize(numUniques, _len), false);
    int j=0;
    for (Double d : hs)
      UnsafeUtils.set8d(_mem, j++ << 3, d);
    final int idxOff = numUniques << 3;
    for (int i=0; i<len; ++i) {
      final int pos = findUnique(UnsafeUtils.get8d(bs, i << 3));
      assert pos >= 0 : "row " + i + " holds a value that is not in the unique set";
      // Narrowing to byte keeps the low 8 bits; readers undo the sign with & 0xFF.
      UnsafeUtils.set1(_mem, idxOff + i, (byte)pos);
    }
    // Trailer occupies the last 4 bytes of _mem, which is where read_impl looks for it.
    UnsafeUtils.set4(_mem, idxOff + len, numUniques);
    _start = -1;
  }

  /**
   * Linear search of the unique-value table (binary search not needed for now).
   * Uses {@link Double#compare} so that NaN matches NaN and -0.0 is distinct
   * from 0.0, the same equality the {@code HashSet<Double>} applied.
   *
   * @return table index of {@code d}, or -1 if it is not present
   */
  private int findUnique(double d) {
    for (int j = 0; j < numUniques; ++j)
      if (Double.compare(d, UnsafeUtils.get8d(_mem, j << 3)) == 0)
        return j;
    return -1;
  }

  /** Reads the table index stored for {@code row} as an unsigned byte (0..255). */
  private int uniqueIndex(int row) {
    // get1 returns a signed byte; mask so indices 128..255 do not turn negative.
    return UnsafeUtils.get1(_mem, (numUniques << 3) + row) & 0xFF;
  }

  /**
   * @return the value at row {@code i} truncated to a long
   * @throws IllegalArgumentException if the stored value is NaN
   */
  @Override protected final long at8_impl( int i ) {
    double res = atd_impl(i);
    if( Double.isNaN(res) ) throw new IllegalArgumentException("at8_impl but value is missing");
    return (long)res;
  }

  /** @return the double stored for row {@code i}, looked up through its table index */
  @Override protected final double atd_impl( int i ) {
    return UnsafeUtils.get8d(_mem, uniqueIndex(i) << 3);
  }

  /** A row counts as missing when its stored value is NaN. */
  @Override protected final boolean isNA_impl( int i ) { return Double.isNaN(atd_impl(i)); }

  /** Long values are never stored in place; always returns false. */
  @Override boolean set_impl(int idx, long l) { return false; }

  /**
   * Points row {@code i} at an existing table entry equal to {@code d}.
   *
   * @return true if {@code d} is already in the table and the row was updated,
   *         false if the value cannot be stored without growing the table
   */
  @Override boolean set_impl(int i, double d) {
    final int pos = findUnique(d);
    if (pos < 0) return false;
    UnsafeUtils.set1(_mem, (numUniques << 3) + i, (byte) pos);
    return true;
  }

  @Override boolean set_impl(int i, float f ) {
    return set_impl(i, (double)f);
  }

  /** Succeeds only when NaN is already one of the table entries. */
  @Override boolean setNA_impl(int idx) {
    return set_impl(idx, Double.NaN);
  }

  /** Expands every row back into a plain double array held by {@code nc}. */
  @Override public NewChunk inflate_impl(NewChunk nc) {
    nc.alloc_doubles(_len);
    for( int i=0; i< _len; i++ )
      nc.doubles()[i] = atd_impl(i);
    nc.set_sparseLen(nc.set_len(_len));
    return nc;
  }

  @Override public AutoBuffer write_impl(AutoBuffer bb) {return bb.putA1(_mem,_mem.length); }

  /** Restores {@code numUniques} and the row count from the serialized bytes. */
  @Override public CUDChunk read_impl(AutoBuffer bb) {
    _mem = bb.bufClose();
    _start = -1; _cidx = -1;
    numUniques = UnsafeUtils.get4(_mem, _mem.length-4);
    // Parentheses are required: '-' binds tighter than '<<'.
    set_len(_mem.length - 4 - (numUniques << 3));
    return this;
  }
}
20 changes: 13 additions & 7 deletions h2o-core/src/main/java/water/fvec/NewChunk.java
Expand Up @@ -4,9 +4,7 @@
import water.Futures;
import water.H2O;
import water.MemoryManager;
import water.parser.ParseTime;
import water.parser.ValueString;
import water.util.Log;
import water.util.PrettyPrint;
import water.util.UnsafeUtils;

Expand Down Expand Up @@ -958,10 +956,10 @@ private byte[] bufS(final int valsz){
break;
case 4:
int ival = (int)lval;
UnsafeUtils.set4(buf, off+ridsz, ival);
UnsafeUtils.set4(buf, off + ridsz, ival);
break;
case 8:
UnsafeUtils.set8(buf, off+ridsz, lval);
UnsafeUtils.set8(buf, off + ridsz, lval);
break;
default:
throw H2O.fail();
Expand Down Expand Up @@ -1031,20 +1029,28 @@ private byte[] bufX( long bias, int scale, int off, int log ) {

// Compute a compressed double buffer.
// Materializes every row as an 8-byte double in a dense buffer and, while doing
// so, collects the distinct values. If there are few enough of them and the
// "few unique doubles" encoding is at least 50% smaller than the plain buffer,
// a CUDChunk is returned; otherwise the dense buffer is wrapped in a C8DChunk.
private Chunk chunkD() {
  HashSet<Double> hs = new HashSet<>(CUDChunk.MAX_UNIQUES);
  final byte [] bs = MemoryManager.malloc1(_len *8,true);
  int j = 0;
  boolean fitsInUnique = true;
  for(int i = 0; i < _len; ++i){
    double d = 0;
    if(_id == null || _id.length == 0 || (j < _id.length && _id[j] == i)) {
      d = _ds != null?_ds[j]:(isNA2(j)||isEnum(j))?Double.NaN:_ls[j]*PrettyPrint.pow10(_xs[j]);
      ++j;
    }
    // Track distinct values only while they can still fit. Test the size AFTER
    // the add, so that a repeated value seen when the set is exactly full does
    // not disqualify the chunk; stop adding once the limit is exceeded.
    if (fitsInUnique) {
      hs.add(d);
      if (hs.size() > CUDChunk.MAX_UNIQUES)
        fitsInUnique = false;
    }
    UnsafeUtils.set8d(bs, 8*i, d);
  }
  assert j == sparseLen() :"j = " + j + ", _len = " + sparseLen();
  // See if it's possible and worth the computational overhead to compress into CUDChunk
  // -> Heuristic: should at least save 50% in memory compared to C8DChunk
  if (fitsInUnique && CUDChunk.computeByteSize(hs.size(), len()) < 0.5 * len()*8)
    return new CUDChunk(bs, hs, len());
  return new C8DChunk(bs);
}

Expand Down
2 changes: 2 additions & 0 deletions h2o-core/src/main/java/water/util/ChunkSummary.java
Expand Up @@ -30,6 +30,7 @@ public class ChunkSummary extends MRTask<ChunkSummary> {
"C16", // UUID
"CStr", // Strings
"CXD", // Sparse doubles
"CUD", // Few Unique doubles
"C8D", //leave this as last -> no compression
};
final transient static String[] chunkNames = new String[]{
Expand All @@ -50,6 +51,7 @@ public class ChunkSummary extends MRTask<ChunkSummary> {
"128-bit UUID",
"String",
"Sparse Reals",
"Unique Reals",
"64-bit Reals",
};

Expand Down
2 changes: 2 additions & 0 deletions h2o-core/src/main/java/water/util/UnsafeUtils.java
Expand Up @@ -6,12 +6,14 @@
public class UnsafeUtils {
private static final Unsafe _unsafe = UtilUnsafe.getUnsafe();
private static final long _Bbase = _unsafe.arrayBaseOffset(byte[].class);
public static byte get1 ( byte[] buf, int off ) { return _unsafe.getByte (buf, _Bbase+off); }
public static int get2 ( byte[] buf, int off ) { return _unsafe.getShort (buf, _Bbase+off); }
public static int get4 ( byte[] buf, int off ) { return _unsafe.getInt (buf, _Bbase+off); }
public static long get8 ( byte[] buf, int off ) { return _unsafe.getLong (buf, _Bbase+off); }
public static float get4f( byte[] buf, int off ) { return _unsafe.getFloat (buf, _Bbase+off); }
public static double get8d( byte[] buf, int off ) { return _unsafe.getDouble(buf, _Bbase+off); }

public static int set1 (byte[] buf, int off, byte x ) {_unsafe.putByte (buf, _Bbase+off, x); return 1;}
public static int set2 (byte[] buf, int off, short x ) {_unsafe.putShort (buf, _Bbase+off, x); return 2;}
public static int set4 (byte[] buf, int off, int x ) {_unsafe.putInt (buf, _Bbase+off, x); return 4;}
public static int set4f(byte[] buf, int off, float f ) {_unsafe.putFloat (buf, _Bbase+off, f); return 4;}
Expand Down
64 changes: 64 additions & 0 deletions h2o-core/src/test/java/water/fvec/CUDChunkTest.java
@@ -0,0 +1,64 @@
package water.fvec;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.TestUtil;

import java.util.Arrays;

/**
 * Round-trip test for {@link CUDChunk}: compress a column with few distinct
 * doubles, read it back, inflate it, and compress it again.
 */
public class CUDChunkTest extends TestUtil {
  @BeforeClass() public static void setup() { stall_till_cloudsize(1); }

  /**
   * Checks that values survive compress -> inflate -> compress unchanged and that
   * both compressions yield byte-identical buffers.
   */
  @Test
  public void test_inflate_impl() {
    NewChunk nc = new NewChunk(null, 0);
    final double a = -3.1415926e-118;
    final double b = 23423423.234234234;
    final double c = 0.00103E217;
    // A handful of distinct values (including NaN, infinities and extremes) repeated
    // many times, with a few irregular extra entries sprinkled in.
    double[] vals = new double[]{
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        a, Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, b, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, c, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, b, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
        Double.NaN, Double.MIN_VALUE, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, a, 0, b, c, Double.MAX_VALUE,
    };
    for (double v : vals) nc.addNum(v);
    nc.addNA();

    // First compression: must pick the few-unique-doubles encoding.
    Chunk cc = nc.compress();
    Assert.assertEquals(vals.length + 1, cc._len);
    Assert.assertTrue(cc instanceof CUDChunk);
    for (int i = 0; i < vals.length; ++i) Assert.assertEquals(vals[i], cc.atd(i), Math.ulp(vals[i]));
    for (int i = 0; i < vals.length; ++i) Assert.assertEquals(vals[i], cc.at_abs(i), Math.ulp(vals[i]));
    Assert.assertTrue(cc.isNA(vals.length));
    Assert.assertTrue(cc.isNA_abs(vals.length));

    // Inflate back into a NewChunk and verify the values again.
    nc = cc.inflate_impl(new NewChunk(null, 0));
    nc.values(0, nc._len);
    Assert.assertEquals(vals.length + 1, nc._len);
    for (int i = 0; i < vals.length; ++i) Assert.assertEquals(vals[i], nc.atd(i), Math.ulp(vals[i]));
    for (int i = 0; i < vals.length; ++i) Assert.assertEquals(vals[i], nc.at_abs(i), Math.ulp(vals[i]));
    Assert.assertTrue(nc.isNA(vals.length));
    Assert.assertTrue(nc.isNA_abs(vals.length));

    // Second compression: check the NEW chunk (cc2), not the first one.
    Chunk cc2 = nc.compress();
    Assert.assertEquals(vals.length + 1, cc2._len);
    Assert.assertTrue(cc2 instanceof CUDChunk);
    for (int i = 0; i < vals.length; ++i) Assert.assertEquals(vals[i], cc2.atd(i), Math.ulp(vals[i]));
    for (int i = 0; i < vals.length; ++i) Assert.assertEquals(vals[i], cc2.at_abs(i), Math.ulp(vals[i]));
    Assert.assertTrue(cc2.isNA(vals.length));
    Assert.assertTrue(cc2.isNA_abs(vals.length));
    Assert.assertTrue(Arrays.equals(cc._mem, cc2._mem));
  }
}

0 comments on commit 5ef3008

Please sign in to comment.