Skip to content

Commit

Permalink
bit packed integer encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
dgllghr committed Jan 29, 2024
1 parent 1032020 commit cd47ec8
Show file tree
Hide file tree
Showing 4 changed files with 327 additions and 14 deletions.
17 changes: 5 additions & 12 deletions src/stripe/encode/bit_packed_bool.zig
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@ comptime {
}

pub const Validator = struct {
const Self = @This();

count: u32,

const Self = @This();
pub const Encoder = bit_packed_bool.Encoder;

pub fn init() Self {
Expand All @@ -43,13 +42,11 @@ pub const Validator = struct {
};

pub const Encoder = struct {
const Self = @This();

const BitIndexInt = u6;

word: Word,
bit_index: BitIndexInt,

const Self = @This();
const BitIndexInt = u6;
const Value = bool;

fn init() Self {
Expand Down Expand Up @@ -83,11 +80,11 @@ pub const Encoder = struct {
};

pub const Decoder = struct {
const Self = @This();

index: usize,
current_word: ?Word,

const Self = @This();

pub fn init() Self {
return .{
.index = 0,
Expand Down Expand Up @@ -118,10 +115,6 @@ pub const Decoder = struct {
return (self.current_word.? >> bit_index) & 1 > 0;
}

pub fn readAll(_: *Self, _: []bool, _: anytype) !void {
@panic("todo");
}

fn loadWord(self: *Self, blob: anytype, word_index: usize) !void {
var buf: [@sizeOf(Word)]u8 = undefined;
const byte_index = word_index * @sizeOf(Word);
Expand Down
310 changes: 310 additions & 0 deletions src/stripe/encode/bit_packed_int.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
//! Packs zig-zag encoded integers from least significant bit to most in 64 bit, unsigned integers.
//! Integers are encoded as little endian bytes.

const std = @import("std");
const debug = std.debug;
const mem = std.mem;
const testing = std.testing;

const Encoding = @import("../encoding.zig").Encoding;
const Error = @import("../error.zig").Error;
const Valid = @import("../validator.zig").Valid;

const bit_packed_int = @This();

pub const Validator = struct {
bit_width: u8,
count: u32,

const Self = @This();
pub const Encoder = bit_packed_int.Encoder;

pub fn init() Self {
return .{
.bit_width = 0,
.count = 0,
};
}

pub fn next(self: *Self, value: i64) void {
// Always zig-zag encode tha value
// TODO is it faster to skip zig zag if all inputs are >= 0?
// TODO is it faster to calculate zig zag bit width without doing the full encoding?
const value_zz = zig_zag.encode(value);
const sig_bits: u8 = 64 - @clz(value_zz);
if (sig_bits > self.bit_width) {
self.bit_width = sig_bits;
}

self.count += 1;
}

pub fn end(self: Self) !Valid(Self.Encoder) {
// Do not support 64 bit bit-packing because then there isn't any point to packing
if (self.bit_width == 64) {
return Error.NotEncodable;
}

// The number of octets written is always a factor of 8 (64 bits) so that reads and writes
// operate on a 64-bit integer in memory and loads and stores operate on that same integer
// Add 1 to the length to account for the byte that stores the bit width
const byte_len = (((self.count * self.bit_width) + 64 - 1) / 64) * 8 + 1;
return .{
.meta = .{
.byte_len = byte_len,
.encoding = Encoding.BitPacked,
},
.encoder = Self.Encoder.init(@intCast(self.bit_width)),
};
}
};

pub const Encoder = struct {
bit_width: u6,

word_bit_index: u6,
word: u64,

const Self = @This();

fn init(bit_width: u6) Self {
return .{ .bit_width = bit_width, .word_bit_index = 0, .word = 0 };
}

pub fn deinit(_: *Self) void {}

pub fn begin(self: *Self, writer: anytype) !bool {
try writer.writeByte(self.bit_width);
return true;
}

pub fn write(self: *Self, writer: anytype, value: i64) !void {
const value_zz = zig_zag.encode(value);
// The int cast is safe because bit index is <= 63 at this point
self.word |= value_zz << @intCast(self.word_bit_index);
const bits_used: u8 = @as(u8, 64) - self.word_bit_index;
self.word_bit_index +%= self.bit_width;

if (bits_used <= self.bit_width) {
// Flush the word
try writer.writeInt(u64, self.word, .little);

// Some bits might have been truncated. Write them to the next word
self.word = value_zz >> @intCast(bits_used);
}
}

pub fn end(self: *Self, writer: anytype) !void {
if (self.word_bit_index > 0) {
try writer.writeInt(u64, self.word, .little);
}
}
};

pub const Decoder = struct {
bit_width: u6,

/// Global bit index
bit_index: usize,
current_words: ?struct {
// Keep a window of 2 words because values can span at most two words
buf: [2]u64,
word_index: usize,
},

const Self = @This();

pub fn init() Self {
return .{
.bit_width = 0,
.bit_index = 0,
.current_words = null,
};
}

pub fn begin(self: *Self, blob: anytype) !void {
var buf: [1]u8 = undefined;
try blob.readAt(buf[0..], 0);
debug.assert(buf[0] < 64);
self.bit_width = @intCast(buf[0]);
}

pub fn next(self: *Self, n: u32) void {
self.bit_index += @as(usize, self.bit_width) * n;

if (self.current_words != null and
self.currEndWordIndex() - self.current_words.?.word_index > 1)
{
self.current_words = null;
}
}

pub fn read(self: *Self, blob: anytype) !i64 {
if (self.current_words == null) {
try self.loadWords(blob);
}

const curr_words = self.current_words.?;
const word_bit_index: u6 = @intCast(self.bit_index % 64);
if (curr_words.word_index == self.currWordIndex()) {
// Value starts in the buf[0] word. The value may be spread across two words
const lower: u64 = (curr_words.buf[0] >> word_bit_index) &
((@as(u64, 1) << self.bit_width) - 1);

const end_bit_index: u8 = @as(u8, word_bit_index) + self.bit_width;
if (end_bit_index > 64) {
const upper: u64 = (curr_words.buf[1] &
((@as(u64, 1) << @as(u6, @intCast(end_bit_index - 64))) - 1));
const full = (upper << @intCast(@as(u8, 64) - word_bit_index)) | lower;
return zig_zag.decode(full);
}

return zig_zag.decode(lower);
}

// Value starts in the buf[1] word. The entire value must fit in the buf[1] word
debug.assert(self.currWordIndex() == curr_words.word_index + 1);
const value: u64 = (curr_words.buf[1] >> word_bit_index) &
((@as(u64, 1) << @intCast(self.bit_width)) - 1);
return zig_zag.decode(value);
}

fn loadWords(self: *Self, blob: anytype) !void {
var buf: [16]u8 = undefined;
const word_index = self.currWordIndex();
// Add 1 to account for the byte that stores the bit width
const byte_index = word_index * 8 + 1;

// Account for byte that stores the bit width
if (byte_index + 8 == blob.len()) {
// Read the last byte
try blob.readAt(buf[0..8], byte_index);
self.current_words = .{
.buf = [2]u64{
mem.readInt(u64, buf[0..8], .little),
undefined,
},
.word_index = word_index,
};
return;
}

debug.assert(byte_index + 8 < blob.len());
try blob.readAt(buf[0..], byte_index);
self.current_words = .{
.buf = [2]u64{
mem.readInt(u64, buf[0..8], .little),
mem.readInt(u64, buf[8..], .little),
},
.word_index = word_index,
};
}

fn currWordIndex(self: *Self) usize {
return self.bit_index / 64;
}

fn currEndWordIndex(self: *Self) usize {
return (self.bit_index + self.bit_width - 1) / 64;
}
};

const zig_zag = struct {
pub fn encode(value: i64) u64 {
return @bitCast((2 * value) ^ (value >> (8 * 8 - 1)));
}

pub fn decode(value: u64) i64 {
return @as(i64, @bitCast(value >> 1)) ^ (-@as(i64, @bitCast(value & 1)));
}
};

const neg_10_to_10_encoded_bytes = [_]u8{
0x33, 0xBE, 0xB6, 0xD2, 0x29, 0x23, 0x00, 0x41,
0x0C, 0x52, 0xCC, 0x41, 0x49, 0x01, 0x00, 0x00,
};

test "bit packed int: encode" {
const allocator = testing.allocator;

var data = try std.ArrayList(u8).initCapacity(allocator, 17);
defer data.deinit();
const writer = data.writer();

var encoder = Encoder.init(5);
const cont = try encoder.begin(writer);
try testing.expectEqual(true, cont);
var v: i64 = -10;
while (v <= 10) {
try encoder.write(writer, v);
v += 1;
}
try encoder.end(writer);

try testing.expectEqual(5, data.items[0]);
try testing.expectEqualSlices(u8, neg_10_to_10_encoded_bytes[0..], data.items[1..]);
}

test "bit packed int: decode" {
const MemoryBlob = @import("../../MemoryBlob.zig");

var blob_data: [neg_10_to_10_encoded_bytes.len + 1]u8 = undefined;
blob_data[0] = 5;
@memcpy(blob_data[1..], neg_10_to_10_encoded_bytes[0..]);
var blob = MemoryBlob{ .data = &blob_data };

var decoder = Decoder.init();
try decoder.begin(&blob);
var expected: i64 = -10;
while (expected <= 10) {
const v = try decoder.read(&blob);
try testing.expectEqual(expected, v);
decoder.next(1);
expected += 1;
}
}

test "bit packed int: round trip" {
const MemoryBlob = @import("../../MemoryBlob.zig");
const allocator = testing.allocator;

const bit_widths = [_]u6{ 1, 2, 5, 7, 8, 30, 32, 63 };

for (&bit_widths) |bit_width| {
// TODO this isn't max value but number of encodable numbers (centered at 0)
const min_value: i64 = @intCast(@max(
-10_000,
-@divTrunc(std.math.pow(i128, 2, @intCast(bit_width)), 2),
));
const max_value: i64 = @intCast(@min(
10_000,
@divTrunc(std.math.pow(i128, 2, @intCast(bit_width)), 2) - 1,
));

var data = try std.ArrayList(u8).initCapacity(allocator, 17);
defer data.deinit();
const writer = data.writer();

var encoder = Encoder.init(bit_width);
const cont = try encoder.begin(writer);
try testing.expectEqual(true, cont);
var v: i64 = min_value;
while (v <= max_value) {
try encoder.write(writer, v);
v += 1;
}
try encoder.end(writer);

var blob = MemoryBlob{ .data = data.items };

var decoder = Decoder.init();
try decoder.begin(&blob);
var expected: i64 = min_value;
while (expected <= max_value) {
const value = try decoder.read(&blob);
try testing.expectEqual(expected, value);
decoder.next(1);
expected += 1;
}
}
}
6 changes: 4 additions & 2 deletions src/stripe/encode/direct.zig
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,16 @@ pub fn Decoder(
return;
}

// TODO more testing needed
var bytes_dest: []u8 = undefined;
bytes_dest.len = dst.len * @sizeOf(Value);
bytes_dest.ptr = @ptrCast(dst.ptr);
try blob.readAt(bytes_dest[0..], self.index * @sizeOf(Value));
for (dst, 0..) |*v, idx| {
const start = idx * @sizeOf(Value);
v.* = fromBytes(@as(*const [@sizeOf(Value)]u8, @ptrCast(bytes_dest[start..(start + @sizeOf(Value))])));
v.* = fromBytes(@as(
*const [@sizeOf(Value)]u8,
@ptrCast(bytes_dest[start..(start + @sizeOf(Value))]),
));
}
}
};
Expand Down
Loading

0 comments on commit cd47ec8

Please sign in to comment.