Skip to content
This repository has been archived by the owner on Jul 18, 2024. It is now read-only.

fix(cleanup): Optimise cleanup and fix some bugs #25

Merged
merged 23 commits into from
Sep 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 75 additions & 28 deletions bitmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package sroar
import (
"fmt"
"math"
"sort"
"strings"
"sync"

Expand Down Expand Up @@ -47,7 +48,7 @@ func FromBuffer(data []byte) *Bitmap {
return NewBitmap()
}
du := toUint16Slice(data)
x := toUint64Slice(du[:4])[0]
x := toUint64Slice(du[:4])[indexNodeSize]
return &Bitmap{
data: du,
_ptr: data,
Expand All @@ -64,7 +65,7 @@ func FromBufferWithCopy(data []byte) *Bitmap {
dup := make([]byte, len(data))
copy(dup, data)
du := toUint16Slice(dup)
x := toUint64Slice(du[:4])[0]
x := toUint64Slice(du[:4])[indexNodeSize]

return &Bitmap{
data: du,
Expand Down Expand Up @@ -104,7 +105,7 @@ func NewBitmapWith(numKeys int) *Bitmap {
data: make([]uint16, 4*(2*numKeys+2)),
}
ra.keys = toUint64Slice(ra.data)
ra.keys.setAt(indexNodeSize, uint64(len(ra.data)))
ra.keys.setNodeSize(len(ra.data))

// Always generate a container for key = 0x00. Otherwise, node gets confused
// about whether a zero key is a new key or not.
Expand All @@ -131,12 +132,12 @@ func (ra *Bitmap) setKey(k uint64, offset uint64) uint64 {
curSize := uint64(len(ra.keys) * 4) // Multiply by 4 for U64 -> U16.
bySize := curSize
if bySize > math.MaxUint16 {
bySize = math.MaxInt16
bySize = math.MaxUint16
}

ra.scootRight(curSize, uint16(bySize))
ra.keys = toUint64Slice(ra.data[:curSize+bySize])
ra.keys.setAt(0, uint64(curSize+bySize))
ra.keys.setNodeSize(int(curSize + bySize))

// All containers have moved to the right by bySize bytes.
// Update their offsets.
Expand Down Expand Up @@ -195,24 +196,10 @@ func (ra *Bitmap) scootLeft(offset uint64, size uint64) {
ra.data = ra.data[:n-size]
}

// removeKey drops the key/value pair stored at index idx of the key node.
//
// The pair's position is converted from u64 units to u16 units (hence the
// factor of 4) before the underlying data is shifted left over it. The node
// is then asked to adjust its offsets for the shift, and the recorded number
// of keys is decremented by one.
func (ra *Bitmap) removeKey(idx int) {
	// One key plus one value is two u64s, i.e. eight u16s.
	const pairWidth = 8

	pos := uint64(4 * keyOffset(idx))
	ra.scootLeft(pos, pairWidth)
	ra.keys.updateOffsets(pos, pairWidth, false)

	remaining := ra.keys.numKeys() - 1
	ra.keys.setNumKeys(remaining)
}

// removeContainer deletes the container that starts at offset off.
//
// The container's own size field (cont[indexSize]) says how many u16s it
// occupies; that many elements are shifted out of the data, and the key node
// is asked to adjust its stored offsets for the shift.
func (ra *Bitmap) removeContainer(off uint64) {
	width := uint64(ra.getContainer(off)[indexSize])
	ra.scootLeft(off, width)
	ra.keys.updateOffsets(off, width, false)
}

// newContainer appends a zeroed container of sz u16s to the end of ra.data
// and returns the offset at which it starts. The first element of the new
// container is set to sz.
func (ra *Bitmap) newContainer(sz uint16) uint64 {
	// The new container begins where the data currently ends.
	start := uint64(len(ra.data))
	ra.fastExpand(sz)

	// Clear the freshly exposed region before writing the size header.
	end := start + uint64(sz)
	Memclr(ra.data[start:end])
	ra.data[start] = sz
	return start
}
Expand Down Expand Up @@ -546,6 +533,8 @@ func (ra *Bitmap) RemoveRange(lo, hi uint64) {
k1 := lo & mask
k2 := hi & mask

defer ra.Cleanup()

// The complete range lies in a single container.
if k1 == k2 {
if off, has := ra.keys.getValue(k1); has {
Expand All @@ -563,8 +552,6 @@ func (ra *Bitmap) RemoveRange(lo, hi uint64) {
st++
}

defer ra.Cleanup()

for i := st; i < n; i++ {
key := ra.keys.key(i)
if key >= k2 {
Expand All @@ -585,7 +572,6 @@ func (ra *Bitmap) RemoveRange(lo, hi uint64) {
}
}

// There is nothing to remove in the last container.
if uint16(hi) == 0 {
return
}
Expand Down Expand Up @@ -1000,15 +986,76 @@ func (ra *Bitmap) Rank(x uint64) int {
}

func (ra *Bitmap) Cleanup() {
for idx := 1; idx < ra.keys.numKeys(); {
type interval struct {
start uint64
end uint64
}

// Find the ranges that need to be removed from the key space and the container space. Also,
// start the iteration from idx = 1 because we never remove the 0 key.
var keyIntervals, contIntervals []interval
for idx := 1; idx < ra.keys.numKeys(); idx++ {
off := ra.keys.val(idx)
cont := ra.getContainer(off)
if getCardinality(cont) == 0 {
ra.removeContainer(off)
ra.removeKey(idx)
continue
ko := uint64(keyOffset(idx))
contIntervals = append(contIntervals, interval{off, off + uint64(cont[indexSize])})
keyIntervals = append(keyIntervals, interval{4 * ko, 4 * (ko + 2)})
}
}
if len(contIntervals) == 0 {
return
}

merge := func(intervals []interval) []interval {
assert(len(intervals) > 0)

// Merge the ranges in order to reduce scootLeft
merged := []interval{intervals[0]}
for _, ir := range intervals[1:] {
last := merged[len(merged)-1]
if ir.start == last.end {
last.end = ir.end
merged[len(merged)-1] = last
continue
}
merged = append(merged, ir)
}
idx++
return merged
}

// Key intervals are already sorted, but container intervals need to be sorted because
// new containers are always added at the end of ra.data.
sort.Slice(contIntervals, func(i, j int) bool {
return contIntervals[i].start < contIntervals[j].start
})

contIntervals = merge(contIntervals)
keyIntervals = merge(keyIntervals)

// Cleanup the containers.
moved := uint64(0)
for _, ir := range contIntervals {
assert(ir.start >= moved)
sz := ir.end - ir.start
ra.scootLeft(ir.start-moved, sz)
ra.keys.updateOffsets(ir.end-moved-1, sz, false)
moved += sz
}

// Cleanup the key space.
moved = uint64(0)
for _, ir := range keyIntervals {
assert(ir.start >= moved)
sz := ir.end - ir.start
ra.scootLeft(ir.start-moved, sz)

// sz is measured in u16s, hence the number of key-value pairs removed is sz/8.
ra.keys.setNumKeys(ra.keys.numKeys() - int(sz/8))
ra.keys.setNodeSize(ra.keys.size() - int(sz))
ra.keys = ra.keys[:len(ra.keys)-int(sz/4)]
ra.keys.updateOffsets(ir.end-moved-1, sz, false)
moved += sz
}
}

Expand Down
93 changes: 85 additions & 8 deletions bitmap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,20 +682,81 @@ func TestExtremes(t *testing.T) {

func TestCleanup(t *testing.T) {
a := NewBitmap()
n := int(1e6)
n := 10

for i := 0; i < n; i++ {
a.Set(uint64(i))
a.Set(uint64((i * (1 << 16))))
}
abuf := a.ToBufferWithCopy()

require.Equal(t, 10, a.keys.numKeys())
a.RemoveRange(1<<16, 2*(1<<16))
require.Equal(t, 9, a.keys.numKeys())

a.RemoveRange(6*(1<<16), 8*(1<<16))
require.Equal(t, 7, a.keys.numKeys())

a = FromBufferWithCopy(abuf)
require.Equal(t, 10, a.keys.numKeys())
a.Remove(6 * (1 << 16))
a.RemoveRange(7*(1<<16), 9*(1<<16))
require.Equal(t, 7, a.keys.numKeys())

n = int(1e6)
b := NewBitmap()
for i := 0; i < n; i++ {
b.Set(uint64(i))
}
b.RemoveRange(0, uint64(n/2))
require.Equal(t, n/2, b.GetCardinality())
buf := b.ToBuffer()
b = FromBuffer(buf)
require.Equal(t, n/2, b.GetCardinality())
}

func TestCleanup2(t *testing.T) {
a := NewBitmap()
n := 10
for i := 0; i < n; i++ {
a.Set(uint64(i * (1 << 16)))
}
for i := 65536; i < n; i++ {
a.Remove(uint64(i))
require.Equal(t, n, a.GetCardinality())
require.Equal(t, n, a.keys.numKeys())

for i := 0; i < n; i++ {
if i%2 == 1 {
a.Remove(uint64(i * (1 << 16)))
}
}
require.Equal(t, n/2, a.GetCardinality())
require.Equal(t, n, a.keys.numKeys())

a.Cleanup()
for i := 0; i < 65535; i++ {
require.Truef(t, a.Contains(uint64(i)), "idx: %d", i)
require.Equal(t, n/2, a.GetCardinality())
require.Equal(t, n/2, a.keys.numKeys())
}

func TestCleanupSplit(t *testing.T) {
a := NewBitmap()
n := int(1e8)

for i := 0; i < n; i++ {
a.Set(uint64(i))
}
for i := 65536; i < n; i++ {
require.Falsef(t, a.Contains(uint64(i)), "idx: %d", i)

split := func() {
n := a.GetCardinality()
mid, err := a.Select(uint64(n / 2))
require.NoError(t, err)

b := a.Clone()
a.RemoveRange(0, mid)
b.RemoveRange(mid, math.MaxUint64)

require.Equal(t, n, a.GetCardinality()+b.GetCardinality())
}
for a.GetCardinality() > 1 {
split()
}
}

Expand Down Expand Up @@ -745,3 +806,19 @@ func TestRank(t *testing.T) {
}
}
}

// TestAnd2 fills a bitmap with every value in [0, total), calls RemoveRange
// over the lower half, sets every value again, and checks that the
// cardinality is total both before the removal and after the refill.
func TestAnd2(t *testing.T) {
	const total = int(1e7)

	bm := NewBitmap()
	fill := func() {
		for v := 0; v < total; v++ {
			bm.Set(uint64(v))
		}
	}

	fill()
	require.Equal(t, total, bm.GetCardinality())

	bm.RemoveRange(0, uint64(total/2))

	// Setting every value again must restore the full cardinality.
	fill()
	require.Equal(t, total, bm.GetCardinality())
}
4 changes: 3 additions & 1 deletion keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ var (
type node []uint64

// keyOffset returns the index inside a node of the i-th key: entries start at
// indexNodeStart and each entry takes two slots, with the key in the first.
func keyOffset(i int) int { return indexNodeStart + 2*i }
func valOffset(i int) int { return indexNodeStart + 1 + 2*i }
func valOffset(i int) int { return indexNodeStart + 2*i + 1 }

// numKeys returns the key count recorded in the node's indexNumKeys slot.
func (n node) numKeys() int { return int(n[indexNumKeys]) }
// size returns the value recorded in the node's indexNodeSize slot.
func (n node) size() int { return int(n[indexNodeSize]) }
// maxKeys returns how many two-slot key/value entries fit after indexNodeStart.
func (n node) maxKeys() int { return (len(n) - indexNodeStart) / 2 }
// key returns the i-th key, read from slot keyOffset(i).
func (n node) key(i int) uint64 { return n[keyOffset(i)] }
// val returns the i-th value, read from slot valOffset(i).
func (n node) val(i int) uint64 { return n[valOffset(i)] }
Expand All @@ -29,6 +30,7 @@ func (n node) uint64(idx int) uint64 { return n[idx] }
// setAt stores k in raw slot idx of the node.
func (n node) setAt(idx int, k uint64) { n[idx] = k }

// setNumKeys records num in the node's indexNumKeys slot.
func (n node) setNumKeys(num int) { n[indexNumKeys] = uint64(num) }
// setNodeSize records sz in the node's indexNodeSize slot.
func (n node) setNodeSize(sz int) { n[indexNodeSize] = uint64(sz) }

func (n node) maxKey() uint64 {
idx := n.numKeys()
Expand Down