Skip to content

Commit

Permalink
Add FromUnsafeBytes to prevent small allocations caused by ByteInputA…
Browse files Browse the repository at this point in the history
…dapter

Signed-off-by: Xiaochao Dong (@damnever) <the.xcdong@gmail.com>
  • Loading branch information
damnever committed Aug 16, 2023
1 parent c3f199b commit 423c967
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 24 deletions.
19 changes: 19 additions & 0 deletions internal/byte_input.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,25 @@ type ByteBuffer struct {
off int
}

// NewByteBuffer creates a new ByteBuffer.
func NewByteBuffer(buf []byte) *ByteBuffer {
return &ByteBuffer{
buf: buf,
}
}

var _ io.Reader = (*ByteBuffer)(nil)

// Read implements io.Reader.
func (b *ByteBuffer) Read(p []byte) (int, error) {
data, err := b.Next(len(p))
if err != nil {
return 0, err
}
copy(p, data)
return len(data), nil
}

// Next returns a slice containing the next n bytes from the reader
// If there are fewer bytes than the given n, io.ErrUnexpectedEOF will be returned
func (b *ByteBuffer) Next(n int) ([]byte, error) {
Expand Down
45 changes: 28 additions & 17 deletions roaring.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func (rb *Bitmap) ToBytes() ([]byte, error) {
func (rb *Bitmap) Checksum() uint64 {
const (
offset = 14695981039346656037
prime = 1099511628211
prime = 1099511628211
)

var bytes []byte
Expand Down Expand Up @@ -106,6 +106,14 @@ func (rb *Bitmap) Checksum() uint64 {
return hash
}

// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) {
stream := internal.NewByteBuffer(data)
return rb.ReadFrom(stream)
}

// ReadFrom reads a serialized version of this bitmap from stream.
// The format is compatible with other RoaringBitmap
// implementations (Java, C) and is documented here:
Expand All @@ -114,12 +122,18 @@ func (rb *Bitmap) Checksum() uint64 {
// So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom.
// It is not necessary to pass cookieHeader when call roaring.ReadFrom to read the roaring32 data directly.
func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) {
stream := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
stream.Reset(reader)
stream, ok := reader.(internal.ByteInput)
if !ok {
byteInputAdapter := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
byteInputAdapter.Reset(reader)
stream = byteInputAdapter
}

p, err = rb.highlowcontainer.readFrom(stream, cookieHeader...)
internal.ByteInputAdapterPool.Put(stream)

if !ok {
internal.ByteInputAdapterPool.Put(stream.(*internal.ByteInputAdapter))
}
return
}

Expand All @@ -144,7 +158,6 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err
// bitmap derived from this bitmap (e.g., via Or, And) might
// also be broken. Thus, before making buf unavailable, you should
// call CloneCopyOnWriteContainers on all such bitmaps.
//
func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) {
stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer)
stream.Reset(buf)
Expand Down Expand Up @@ -276,9 +289,9 @@ type intIterator struct {
// This way, instead of making up-to 64k allocations per full iteration
// we get a single allocation and simply reinitialize the appropriate
// iterator and point to it in the generic `iter` member on each key bound.
shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerShortIterator
shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerShortIterator
}

// HasNext returns true if there are more integers to iterate over
Expand Down Expand Up @@ -341,7 +354,6 @@ func (ii *intIterator) AdvanceIfNeeded(minval uint32) {
// IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
type IntIterator = intIterator


// Initialize configures the existing iterator so that it can iterate through the values of
// the provided bitmap.
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
Expand All @@ -357,9 +369,9 @@ type intReverseIterator struct {
iter shortIterable
highlowcontainer *roaringArray

shortIter reverseIterator
runIter runReverseIterator16
bitmapIter reverseBitmapContainerShortIterator
shortIter reverseIterator
runIter runReverseIterator16
bitmapIter reverseBitmapContainerShortIterator
}

// HasNext returns true if there are more integers to iterate over
Expand Down Expand Up @@ -434,9 +446,9 @@ type manyIntIterator struct {
iter manyIterable
highlowcontainer *roaringArray

shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerManyIterator
shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerManyIterator
}

func (ii *manyIntIterator) init() {
Expand Down Expand Up @@ -495,7 +507,6 @@ func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int {
return n
}


// ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
type ManyIntIterator = manyIntIterator

Expand Down Expand Up @@ -569,7 +580,7 @@ func (rb *Bitmap) Iterate(cb func(x uint32) bool) {
// Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order;
// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove).
func (rb *Bitmap) Iterator() IntPeekable {
p := new(intIterator)
p := new(intIterator)
p.Initialize(rb)
return p
}
Expand Down
68 changes: 68 additions & 0 deletions roaring64/roaring64.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strconv"

"github.com/RoaringBitmap/roaring"
"github.com/RoaringBitmap/roaring/internal"
)

const serialCookieNoRunContainer = 12346 // only arrays and bitmaps
Expand Down Expand Up @@ -80,6 +81,73 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
return n, nil
}

// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) {
stream := internal.NewByteBuffer(data)

cookie, r32, p, err := tryReadFromRoaring32ByteBuffer(rb, stream)
if err != nil {
return p, err
} else if r32 {
return p, nil
}
// TODO: Add buffer interning as in base roaring package.

sizeBuf, err := stream.Next(4)
if err != nil {
return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read number of containers: %w", err)
}
p += 4
sizeBuf = append(cookie, sizeBuf...)

size := binary.LittleEndian.Uint64(sizeBuf)
rb.highlowcontainer = roaringArray64{}
rb.highlowcontainer.keys = make([]uint32, size)
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
for i := uint64(0); i < size; i++ {
keyBuf, err := stream.Next(4)
if err != nil {
return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read key #%d: %w", i, err)
}
p += 4
rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf)
rb.highlowcontainer.containers[i] = roaring.NewBitmap()
n, err := rb.highlowcontainer.containers[i].ReadFrom(stream)
if n == 0 || err != nil {
return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err)
}
p += int64(n)
}

return p, nil
}

func tryReadFromRoaring32ByteBuffer(rb *Bitmap, stream *internal.ByteBuffer) (cookie []byte, r32 bool, p int64, err error) {
// Verify the first two bytes are a valid MagicNumber.
cookie, err = stream.Next(4)
if err != nil {
return cookie, false, 0, err
}
fileMagic := int(binary.LittleEndian.Uint16(cookie[0:2]))
if fileMagic == serialCookieNoRunContainer || fileMagic == serialCookie {
bm32 := roaring.NewBitmap()
p, err = bm32.ReadFrom(stream, cookie...)
if err != nil {
return
}
rb.highlowcontainer = roaringArray64{
keys: []uint32{0},
containers: []*roaring.Bitmap{bm32},
needCopyOnWrite: []bool{false},
}
return cookie, true, p, nil
}
return
}

// ReadFrom reads a serialized version of this bitmap from stream.
// The format is compatible with other 64-bit RoaringBitmap
// implementations (Java, Go, C++) and it has a specification :
Expand Down
61 changes: 54 additions & 7 deletions roaring64/serialization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package roaring64
import (
"bytes"
"fmt"
"io"
"os"
"runtime"
"testing"
Expand All @@ -21,12 +22,18 @@ func TestSerializationOfEmptyBitmap(t *testing.T) {

require.NoError(t, err)
assert.EqualValues(t, buf.Len(), rb.GetSerializedSizeInBytes())
data := buf.Bytes()

newrb := NewBitmap()
_, err = newrb.ReadFrom(buf)

require.NoError(t, err)
assert.True(t, rb.Equals(newrb))

newrb2 := NewBitmap()
_, err = newrb2.FromUnsafeBytes(data)
require.NoError(t, err)
assert.True(t, rb.Equals(newrb2))
}

func TestBase64_036(t *testing.T) {
Expand All @@ -51,26 +58,32 @@ func TestSerializationBasic037(t *testing.T) {

require.NoError(t, err)
assert.EqualValues(t, buf.Len(), rb.GetSerializedSizeInBytes())
data := buf.Bytes()

newrb := NewBitmap()
_, err = newrb.ReadFrom(buf)

require.NoError(t, err)
assert.True(t, rb.Equals(newrb))

newrb2 := NewBitmap()
_, err = newrb2.FromUnsafeBytes(data)
require.NoError(t, err)
assert.True(t, rb.Equals(newrb2))
}

func TestSerializationToFile038(t *testing.T) {
rb := BitmapOf(1, 2, 3, 4, 5, 100, 1000)
fname := "myfile.bin"
fout, err := os.OpenFile(fname, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0660)
if(err != nil) {
if err != nil {
fmt.Fprintf(os.Stderr, "\n\nIMPORTANT: For testing file IO, the roaring library requires disk access.\nWe omit some tests for now.\n\n")
return
}

var l int64
l, err = rb.WriteTo(fout)
if(err != nil) {
if err != nil {
fmt.Fprintf(os.Stderr, "\n\nIMPORTANT: For testing file IO, the roaring library requires disk access.\nWe omit some tests for now.\n\n")
return
}
Expand All @@ -81,18 +94,25 @@ func TestSerializationToFile038(t *testing.T) {

newrb := NewBitmap()
fin, err := os.Open(fname)
if(err != nil) {
if err != nil {
fmt.Fprintf(os.Stderr, "\n\nIMPORTANT: For testing file IO, the roaring library requires disk access.\nWe omit some tests for now.\n\n")
return
}
buf := bytes.NewBuffer(nil)
teer := io.TeeReader(fin, buf)

defer func() {
fin.Close()
_ = os.Remove(fname)
}()

_, _ = newrb.ReadFrom(fin)
_, _ = newrb.ReadFrom(teer)
assert.True(t, rb.Equals(newrb))

newrb2 := NewBitmap()
_, err = newrb2.FromUnsafeBytes(buf.Bytes())
require.NoError(t, err)
assert.True(t, rb.Equals(newrb2))
}

func TestSerializationBasic2_041(t *testing.T) {
Expand All @@ -104,12 +124,18 @@ func TestSerializationBasic2_041(t *testing.T) {

require.NoError(t, err)
assert.Equal(t, l, buf.Len())
data := buf.Bytes()

newrb := NewBitmap()
_, err = newrb.ReadFrom(buf)

require.NoError(t, err)
assert.True(t, rb.Equals(newrb))

newrb2 := NewBitmap()
_, err = newrb2.FromUnsafeBytes(data)
require.NoError(t, err)
assert.True(t, rb.Equals(newrb2))
}

// roaringarray.writeTo and .readFrom should serialize and unserialize when containing all 3 container types
Expand All @@ -124,12 +150,18 @@ func TestSerializationBasic3_042(t *testing.T) {

require.NoError(t, err)
assert.EqualValues(t, buf.Len(), int(rb.GetSerializedSizeInBytes()))
data := buf.Bytes()

newrb := NewBitmap()
_, err = newrb.ReadFrom(&buf)

require.NoError(t, err)
assert.True(t, newrb.Equals(rb))

newrb2 := NewBitmap()
_, err = newrb2.FromUnsafeBytes(data)
require.NoError(t, err)
assert.True(t, rb.Equals(newrb2))
}

func TestHoldReference(t *testing.T) {
Expand Down Expand Up @@ -172,7 +204,23 @@ func TestHoldReference(t *testing.T) {
})
}

func BenchmarkUnserializeFromUnsafeBytes(b *testing.B) {
benchmarkUnserializeFunc(b, "FromUnsafeBytes", func(bitmap *Bitmap, data []byte) (int64, error) {
copied := make([]byte, len(data))
copy(copied, data)
return bitmap.FromUnsafeBytes(copied)
})
}

func BenchmarkUnserializeReadFrom(b *testing.B) {
benchmarkUnserializeFunc(b, "ReadFrom", func(bitmap *Bitmap, data []byte) (int64, error) {
return bitmap.ReadFrom(bytes.NewReader(data))
})
}

func benchmarkUnserializeFunc(b *testing.B, name string, f func(*Bitmap, []byte) (int64, error)) {
b.Helper()

for _, size := range []uint64{650, 6500, 65000, 650000, 6500000} {
rb := New()
buf := &bytes.Buffer{}
Expand All @@ -187,15 +235,14 @@ func BenchmarkUnserializeReadFrom(b *testing.B) {
b.Fatalf("Unexpected error occurs: %v", err)
}

b.Run(fmt.Sprintf("ReadFrom-%d", size), func(b *testing.B) {
b.Run(fmt.Sprintf("%s-%d", name, size), func(b *testing.B) {
b.ReportAllocs()
b.StartTimer()

for n := 0; n < b.N; n++ {
reader := bytes.NewReader(buf.Bytes())
nb := New()

if _, err := nb.ReadFrom(reader); err != nil {
if _, err := f(nb, buf.Bytes()); err != nil {
b.Fatalf("Unexpected error occurs: %v", err)
}
}
Expand Down

0 comments on commit 423c967

Please sign in to comment.