/
cursor.go
1501 lines (1312 loc) · 41 KB
/
cursor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2022 Molecula Corp. (DBA FeatureBase).
// SPDX-License-Identifier: Apache-2.0
package rbf
import (
"encoding/binary"
"fmt"
"io"
"sort"
"unsafe"
"github.com/gernest/roaring"
"github.com/pkg/errors"
)
const (
	// BitmapN is the number of 64-bit words required to hold one bit for
	// every possible value in a 16-bit container keyspace (65536 bits).
	BitmapN = (1 << 16) / 64
)
// Cursor represents an object for traversing forward and backward in the
// keyspace. Users should initialize a cursor with either First(), Last(), or
// Seek() and then move forward or backward using Next() or Prev(), respectively.
//
// If you mutate the b-tree that a cursor is attached to then you'll need to
// re-initialize the cursor. This may be changed in the future though.
//
// Cursors can be reused in some cases; if you're keeping a cursor around,
// be sure to nil out the tx, or it will hold a reference to it.
type Cursor struct {
	tx       *Tx  // transaction the cursor reads pages through
	buffered bool // if true, Next() and Prev() do not move the cursor position

	// buffers — reusable scratch space so container edits don't allocate:
	array     [ArrayMaxSize + 1]uint16           // scratch for array containers (one extra slot for inserts)
	rle       [RLEMaxSize + 1]roaring.Interval16 // scratch for RLE containers (one extra slot for inserts)
	leafCells [PageSize / 8]leafCell             // scratch for deserializing a leaf page's cells

	// stack holds branches
	stack searchStack
}
// searchStack records the descent path through the b-tree: one stackElem per
// visited page (branches plus the final leaf), with the cursor's position in
// each. The fixed 32-entry array bounds the supported tree depth.
type searchStack struct {
	top   int // current depth in elems
	elems [32]stackElem
}
// runAdd inserts v into the sorted, non-overlapping run list runs, extending
// or merging adjacent runs where possible. It reports whether the set changed
// (nil, false when v is already covered). The slice may be modified in place
// and/or reallocated; callers must use the returned slice.
func runAdd(runs []roaring.Interval16, v uint16) ([]roaring.Interval16, bool) {
	// Guard: the search below assumes at least one run exists; without this,
	// an empty list would decrement i to -1 and panic on runs[-1].
	if len(runs) == 0 {
		return append(runs, roaring.Interval16{Start: v, Last: v}), true
	}

	// Find the first run whose Last >= v.
	i := sort.Search(len(runs),
		func(i int) bool { return runs[i].Last >= v })

	if i == len(runs) {
		// v lies beyond every run; work against the final run.
		i--
	}

	iv := runs[i]
	if v >= iv.Start && iv.Last >= v {
		// Already contained in run i; no change.
		return nil, false
	}

	if iv.Last < v {
		if iv.Last == v-1 {
			// Extends the final run by one.
			runs[i].Last++
		} else {
			// Standalone run appended at the end.
			runs = append(runs, roaring.Interval16{Start: v, Last: v})
		}
	} else if v+1 == iv.Start {
		// combining two intervals
		if i > 0 && runs[i-1].Last == v-1 {
			runs[i-1].Last = iv.Last
			runs = append(runs[:i], runs[i+1:]...)
			//TODO check if too big
			return runs, true
		}
		// just before an interval
		runs[i].Start--
	} else if i > 0 && v-1 == runs[i-1].Last {
		// just after an interval
		runs[i-1].Last++
	} else {
		// alone
		newIv := roaring.Interval16{Start: v, Last: v}
		runs = append(runs[:i], append([]roaring.Interval16{newIv}, runs[i:]...)...)
	}
	return runs, true
}
// maybeConvertOversizedRunToBitmap is only called by Cursor.Add().
// bitN must be the correct new bit count.
// If runs has reached RLEMaxSize, the runs are materialized into a dense
// bitmap container; otherwise an RLE leaf cell wrapping runs is returned.
func maybeConvertOversizedRunToBitmap(runs []roaring.Interval16, bitN int, key uint64) leafCell {
	if len(runs) >= RLEMaxSize {
		//convertToBitmap
		bitmap := make([]uint64, BitmapN)
		for _, iv := range runs {
			// Word index and bit position of the run's two endpoints.
			w1, w2 := iv.Start/64, iv.Last/64
			b1, b2 := iv.Start&63, iv.Last&63
			// a mask for everything under bit X looks like
			// (1 << x) - 1. Say b1 is 4; our mask will want
			// to have the bottom 4 bits be zero, so we shift
			// left 4, getting 10000, then subtract 1, and
			// get 01111, which is the mask to *remove*.
			m1 := (uint64(1) << b1) - 1
			// inclusive mask: same thing, then shift left 1 and
			// or in 1. So for 4, we'd get 011111, which is the
			// mask to *keep*.
			m2 := (((uint64(1) << b2) - 1) << 1) | 1
			if w1 == w2 {
				// Run fits within one word.
				// If we only had bit 4 in the range, this would
				// end up being 011111 &^ 01111, or 010000.
				bitmap[w1] |= (m2 &^ m1)
				continue
			}
			// for w2, the "To" field, we want to set the bottom N
			// bits. For w1, the "From" word, we want to set all *but*
			// the bottom N bits.
			bitmap[w2] |= m2
			bitmap[w1] |= ^m1
			words := bitmap[w1+1 : w2]
			// set every bit between them
			for i := range words {
				words[i] = ^uint64(0)
			}
		}
		// note that ElemN should be left 0 for ContainerTypeBitmap.
		return leafCell{Key: key, BitN: int(bitN), Type: ContainerTypeBitmap, Data: fromArray64(bitmap)}
	}
	return leafCell{Key: key, ElemN: len(runs), BitN: int(bitN), Type: ContainerTypeRLE, Data: fromInterval16(runs)}
}
// Add sets a bit on the underlying bitmap.
// If changed is true, this cursor will need to be reinitialized before use.
func (c *Cursor) Add(v uint64) (changed bool, err error) {
	hi, lo := highbits(v), lowbits(v)

	// Move cursor to the key of the container.
	// Insert new container if it doesn't exist.
	if exact, err := c.Seek(hi); err != nil {
		return false, err
	} else if !exact {
		// No container for this key yet: create a one-element array container.
		return true, c.putLeafCell(leafCell{Key: hi, Type: ContainerTypeArray, ElemN: 1, BitN: 1, Data: fromArray16([]uint16{lo})})
	}

	// If the container exists and bit is not set then update the page.
	elem := &c.stack.elems[c.stack.top]
	leafPage, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return false, err
	}
	cell := readLeafCell(leafPage, elem.index)

	switch cell.Type {
	case ContainerTypeArray:
		// Exit if value exists in array container.
		a := toArray16(cell.Data)
		i, ok := arrayIndex(a, lo)
		if ok {
			return false, nil
		}

		// Copy container data into the cursor's scratch buffer and insert
		// the new value at position i, keeping the array sorted.
		other := c.array[:len(a)+1]
		copy(other, a[:i])
		other[i] = lo
		copy(other[i+1:], a[i:])
		return true, c.putLeafCell(leafCell{Key: cell.Key, Type: ContainerTypeArray, ElemN: len(other), BitN: cell.BitN + 1, Data: fromArray16(other)})
	case ContainerTypeRLE:
		// Work on a scratch copy of the runs so the page bytes stay intact.
		runs := toInterval16(cell.Data)
		copy(c.rle[:], runs)
		run, added := runAdd(c.rle[:len(runs)], lo)
		if added {
			// May convert to a bitmap container if the run list grew too large.
			leaf := maybeConvertOversizedRunToBitmap(run, cell.BitN+1, cell.Key)
			return true, c.putLeafCell(leaf)
		}
		return false, nil
	case ContainerTypeBitmapPtr:
		// Exit if bit set in bitmap container.
		pgno, bm, err := c.tx.leafCellBitmap(toPgno(cell.Data))
		if err != nil {
			return false, errors.Wrap(err, "cursor.Add")
		}
		a := cloneArray64(bm)
		if a[lo/64]&(1<<uint64(lo%64)) != 0 {
			return false, nil
		}

		// Insert new value and rewrite page.
		a[lo/64] |= 1 << uint64(lo%64)
		if err := c.tx.writeBitmapPage(pgno, fromArray64(a)); err != nil {
			return false, err
		}
		// Update with new BitN.
		cell.BitN++
		return true, c.putLeafCell(cell)
	default:
		return false, fmt.Errorf("rbf.Cursor.Add(): invalid container type: %d", cell.Type)
	}
}
// Remove unsets a bit on the underlying bitmap.
// If changed is true, the cursor will need to be reinitialized before use.
func (c *Cursor) Remove(v uint64) (changed bool, err error) {
	hi, lo := highbits(v), lowbits(v)

	// Move cursor to the key of the container.
	// Exit if container does not exist.
	if exact, err := c.Seek(hi); err != nil {
		return false, err
	} else if !exact {
		return false, nil
	}

	// If the container exists and the bit is set then update the page.
	elem := &c.stack.elems[c.stack.top]
	leafPage, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return false, err
	}
	cell := readLeafCell(leafPage, elem.index)

	switch cell.Type {
	case ContainerTypeArray:
		// Exit if value does not exist in array container.
		a := toArray16(cell.Data)
		i, ok := arrayIndex(a, lo)
		if !ok {
			return false, nil
		} else if len(a) == 1 {
			// Removing the last element drops the whole container.
			return true, c.deleteLeafCell(cell.Key)
		}

		// Copy container data into scratch space, skipping element i.
		other := c.array[:len(a)-1]
		copy(other[:i], a[:i])
		copy(other[i:], a[i+1:])
		return true, c.putLeafCell(leafCell{Key: cell.Key, Type: ContainerTypeArray, ElemN: len(other), BitN: cell.BitN - 1, Data: fromArray16(other)})

	case ContainerTypeRLE:
		r := toInterval16(cell.Data)
		i, contains := roaring.BinSearchRuns(lo, r)
		if !contains {
			return false, nil
		}
		// INVAR: lo is in run[i]
		copy(c.rle[:], r)
		runs := c.rle[:len(r)]
		if lo == runs[i].Last && lo == runs[i].Start {
			// Single-value run: drop it entirely.
			runs = append(runs[:i], runs[i+1:]...)
		} else if lo == runs[i].Last {
			runs[i].Last--
		} else if lo == runs[i].Start {
			runs[i].Start++
		} else if lo > runs[i].Start {
			// INVAR: Start < lo < Last.
			// We remove lo, so split into two runs:
			last := runs[i].Last
			runs[i].Last = lo - 1
			// INVAR: runs[:i+1] is correct, but still need to insert the new interval at i+1.
			runs = append(runs, roaring.Interval16{})
			// copy the tail first (copy handles the overlap like memmove)
			copy(runs[i+2:], runs[i+1:])
			// overwrite with the new interval.
			runs[i+1] = roaring.Interval16{Start: lo + 1, Last: last}
		}
		if len(runs) == 0 {
			return true, c.deleteLeafCell(cell.Key)
		}
		return true, c.putLeafCell(leafCell{Key: cell.Key, Type: ContainerTypeRLE, ElemN: len(runs), BitN: cell.BitN - 1, Data: fromInterval16(runs)})

	case ContainerTypeBitmapPtr:
		pgno, bm, err := c.tx.leafCellBitmap(toPgno(cell.Data))
		if err != nil {
			// NOTE: previously mislabeled "cursor.add".
			return false, errors.Wrap(err, "cursor.Remove")
		}
		a := cloneArray64(bm)
		if a[lo/64]&(1<<uint64(lo%64)) == 0 {
			// not present.
			return false, nil
		}

		// clear the bit
		a[lo/64] &^= 1 << uint64(lo%64)
		cell.BitN--
		if cell.BitN == 0 {
			// Last bit removed: free the bitmap page and drop the container.
			if err := c.tx.freePgno(pgno); err != nil {
				return false, err
			}
			return true, c.deleteLeafCell(cell.Key)
		}
		// shrink if we've gotten small.
		if cell.BitN <= ArrayMaxSize {
			cbm := roaring.NewContainerBitmap(cell.BitN, a)
			// convert to array
			cbm = roaring.Optimize(cbm)
			leafCell1 := ConvertToLeafArgs(cell.Key, cbm)
			// ConvertToLeafArgs returns leafCell1 with BitN and ElemN updated.
			return true, c.putLeafCell(leafCell1)
		}
		// rewrite page, still as a bitmap.
		if err := c.tx.writeBitmapPage(pgno, fromArray64(a)); err != nil {
			return false, err
		}
		return true, c.putLeafCell(cell)

	default:
		// Fixed copy-paste: this is Remove, not Add.
		return false, fmt.Errorf("rbf.Cursor.Remove(): invalid container type: %d", cell.Type)
	}
}
// Contains returns true if a bit is set on the underlying bitmap.
func (c *Cursor) Contains(v uint64) (exists bool, err error) {
	hi, lo := highbits(v), lowbits(v)

	// Move cursor to the key of the container.
	if exact, err := c.Seek(hi); err != nil {
		return false, err
	} else if !exact {
		return false, nil
	}

	// If the container exists then check for low bits existence.
	elem := &c.stack.elems[c.stack.top]
	leafPage, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return false, err
	}
	cell := readLeafCell(leafPage, elem.index)

	switch cell.Type {
	case ContainerTypeArray:
		// Membership via binary search of the sorted array container.
		a := toArray16(cell.Data)
		_, ok := arrayIndex(a, lo)
		return ok, nil
	case ContainerTypeRLE:
		// Find the first run whose Last >= lo; the bit is set iff that
		// run also starts at or before lo.
		a := toInterval16(cell.Data)
		i := int32(sort.Search(len(a),
			func(i int) bool { return a[i].Last >= lo }))
		if i < int32(len(a)) {
			return (lo >= a[i].Start) && (lo <= a[i].Last), nil
		}
		return false, nil
	case ContainerTypeBitmapPtr:
		// Follow the pointer to the out-of-line bitmap page.
		_, a, err := c.tx.leafCellBitmap(toPgno(cell.Data))
		if err != nil {
			return false, errors.Wrap(err, "cursor.Contains")
		}
		// err is provably nil here; return literal nil rather than err.
		return a[lo/64]&(1<<uint64(lo%64)) != 0, nil
	default:
		return false, fmt.Errorf("rbf.Cursor.Contains(): invalid container type: %d", cell.Type)
	}
}
// fromPgno serializes a page number as a 4-byte little-endian slice,
// matching the on-page pointer encoding decoded by toPgno.
func fromPgno(val uint32) []byte {
	out := make([]byte, 4)
	binary.LittleEndian.PutUint32(out, val)
	return out
}
// toPgno decodes a page number from the first 4 bytes of val,
// which fromPgno wrote in little-endian order.
func toPgno(val []byte) uint32 {
	pgno := binary.LittleEndian.Uint32(val)
	return pgno
}
// putLeafCell inserts or updates a cell into the currently position leaf page
// in the stack. If in.Key exists in the page, the cell is updated. Otherwise,
// cell is inserted.
//
// This method has two paths. First, a fast path is used if we know the insert
// will not cause the page to grow past the max page size. This path simply
// shifts bytes around in the page to make room for the new cell. The second
// path is the slow path where a page may split into two pages. This causes the
// entire page to be deserialized so that pages can more easily be split.
//
// The slow path may cause a cascade of changes to parent pages as a page split
// creates a new entry in the parent. If that entry overflows the parent then
// the parent branch page will split which could cascade up to the root. If the
// root page splits, a new branch page is created but retains the original root
// page number so that the root records do not need to be updated.
func (c *Cursor) putLeafCell(in leafCell) (err error) {
	elem := &c.stack.elems[c.stack.top]
	leafPage, isHeap, err := c.tx.readPage(elem.pgno) // the last read leaf page
	if err != nil {
		return err
	}
	cellN := readCellN(leafPage)

	// Determine if the insert/update will overflow the page.
	// If it doesn't then we can do an optimized write where we don't deserialize.
	isInsert := elem.index >= cellN || pageKeyAt(leafPage, elem.index) != in.Key
	newEstPageSize := leafPageSize(leafPage)
	if isInsert {
		newEstPageSize += in.Size() + leafCellIndexElemSize
	} else {
		// Update: account for the size delta versus the replaced cell.
		newEstPageSize += in.Size() - len(readLeafCellBytesAtOffset(leafPage, readCellOffset(leafPage, elem.index)))
	}

	// Use the fast path if we are not splitting pages and the container types are the same.
	useFast := newEstPageSize+16 <= PageSize
	if useFast && !isInsert {
		if prev := readLeafCell(leafPage, elem.index); prev.Type != in.Type {
			useFast = false
		}
	}

	// Use an optimized routine to insert the leaf cell if we won't overflow.
	// We pad the estimate with 16 bytes because we do 8-byte alignment of
	// both the cell and the index.
	if useFast {
		return c.putLeafCellFast(in, isInsert)
	}

	// Slow path: deserialize every cell so the page can be split if needed.
	cells := readLeafCells(leafPage, c.leafCells[:])
	cell := in
	if isInsert {
		//new cell
		if in.Type == ContainerTypeBitmap {
			// Bitmap containers live on their own page; allocate it and
			// keep only a pointer cell in the leaf.
			//allocated bitmap()
			bitmapPgno, err := c.tx.allocatePgno()
			if err != nil {
				return err
			}
			cell.Data = fromPgno(bitmapPgno)
			cell.Type = ContainerTypeBitmapPtr
			cell.BitN = in.BitN
			cell.ElemN = in.ElemN
		}
		// Shift cells over if this is an insertion.
		cells = append(cells, leafCell{})
		copy(cells[elem.index+1:], cells[elem.index:])
	} else {
		// FB-1239: Free bitmap page if replaced container is a bitmap pointer.
		prev := cells[elem.index]
		if prev.Type == ContainerTypeBitmapPtr {
			if in.Type == ContainerTypeBitmapPtr {
				if toPgno(in.Data) != toPgno(prev.Data) { // bptr-to-bptr with different bitmap pages
					if err := c.tx.freePgno(toPgno(prev.Data)); err != nil {
						return err
					}
				}
			} else if in.Type != ContainerTypeBitmap {
				// Replaced by a non-bitmap container; the old bitmap page
				// is no longer referenced.
				if err := c.tx.freePgno(toPgno(prev.Data)); err != nil {
					return err
				}
			}
		}

		if in.Type == ContainerTypeBitmap {
			// Reuse the existing pointer cell when possible; otherwise
			// allocate a fresh bitmap page for the pointer to target.
			cell = cells[elem.index]
			if cell.Type != ContainerTypeBitmapPtr {
				bitmapPgno, err := c.tx.allocatePgno()
				if err != nil {
					return errors.Wrap(err, "cursor.putLeafCell")
				}
				cell.Type = ContainerTypeBitmapPtr
				cell.Data = fromPgno(bitmapPgno)
				cell.ElemN = in.ElemN
			}
			// update the BitN regardless
			cell.BitN = in.BitN
		}
	}

	if in.Type == ContainerTypeArray && in.ElemN > ArrayMaxSize {
		//convert to bitmap
		in.Type = ContainerTypeBitmap
		a := make([]uint64, PageSize/8)
		for _, v := range toArray16(in.Data) {
			a[v/64] |= 1 << uint64(v%64)
		}
		in.Data = fromArray64(a)

		cell.Type = ContainerTypeBitmapPtr
		bitmapPgno, err := c.tx.allocatePgno()
		if err != nil {
			return err
		}
		cell.Data = fromPgno(bitmapPgno)
	}
	cells[elem.index] = cell

	// Split into multiple pages if page size is exceeded.
	groups := [][]leafCell{cells}
	sz := leafCellsPageSize(cells)
	if sz >= PageSize {
		groups = splitLeafCells(cells)
	}

	// Write each group to a separate page.
	newRoot := (len(groups) > 1) && (c.stack.top == 0)
	parents := make([]branchCell, 0, len(groups))
	origPgno := elem.pgno
	// newRoot if split occured and bottom of the stack
	for i, group := range groups {
		// First page should overwrite the original.
		// Subsequent pages should allocate new pages.
		parent := branchCell{LeftKey: group[0].Key} //<<< this is the key spot for making sure that key is correct
		if i == 0 && !newRoot {
			parent.ChildPgno = origPgno
		} else {
			if parent.ChildPgno, err = c.tx.allocatePgno(); err != nil {
				return fmt.Errorf("cannot allocate leaf: %w", err)
			}
		}

		// if the cell is a bitmap write out its page
		if in.Type == ContainerTypeBitmap {
			var bm [PageSize]byte
			copy(bm[:], fromArray64(toArray64(in.Data)))
			if err = c.tx.writeBitmapPage(toPgno(cell.Data), bm[:]); err != nil {
				return errors.Wrap(err, "putLeafCell writing bitmap page")
			}
		}

		var buf [PageSize]byte

		// Write cells to page.
		writePageNo(buf[:], parent.ChildPgno)
		writeFlags(buf[:], PageTypeLeaf)
		writeCellN(buf[:], len(group))

		offset := dataOffset(len(group))
		x := 0
		for j, cell := range group {
			writeLeafCell(buf[:], j, offset, cell)
			offset += align8(cell.Size())
			x++
		}
		if err := c.tx.writePage(buf[:]); err != nil {
			return err
		}
		parents = append(parents, parent)
	}

	// Free the source page once we've finished with it if it is on heap.
	if isHeap {
		freePage(leafPage)
	}

	// TODO(BBJ): Update page in buffer & cursor stack.

	// If this is not a split then exit now.
	if len(groups) == 1 {
		return nil
	}

	// Initialize a new root if we are currently the root page.
	if c.stack.top == 0 {
		assert(newRoot) // leaf write must be root when stack at root
		return c.writeRoot(origPgno, parents)
	}
	assert(!newRoot) // leaf write must NOT be root when stack not at root

	// Otherwise update existing parent.
	return c.putBranchCells(c.stack.top-1, parents)
}
// putLeafCellFast quickly insert or updates a cell on a leaf page.
// It works by shifting bytes around instead of deserializing. This must not overflow.
func (c *Cursor) putLeafCellFast(in leafCell, isInsert bool) (err error) {
	elem := &c.stack.elems[c.stack.top]
	src, isHeap, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return err
	}
	srcCellN := readCellN(src)

	// Determine the cell count of the new page.
	dstCellN := srcCellN
	if isInsert {
		dstCellN++
	}

	// Write page header.
	dst := allocPage()
	writePageNo(dst, readPageNo(src))
	writeFlags(dst, PageTypeLeaf)
	writeCellN(dst, dstCellN)

	// Copy data before index.
	offset := dataOffset(dstCellN)
	if elem.index > 0 {
		srcStart := dataOffset(srcCellN)
		srcEnd := readCellEndingOffset(src, elem.index-1)
		// Preceding cells all move by the difference between the two
		// data offsets (the index area may have grown by one slot).
		shiftN := offset - srcStart
		copy(dst[offset:], src[srcStart:srcEnd])
		offset += align8(srcEnd - srcStart)

		// Rewrite initial index slots by position moved.
		for i := 0; i < elem.index; i++ {
			writeCellOffset(dst, i, readCellOffset(src, i)+shiftN)
		}
	}

	// Insert new row.
	writeLeafCell(dst[:], elem.index, offset, in)
	offset += align8(in.Size())

	// Copy data after inserted element.
	// On update, the replaced cell's bytes are skipped (start at index+1);
	// on insert, the cell previously at this index is kept.
	if (isInsert && elem.index < srcCellN) || (!isInsert && elem.index < srcCellN-1) {
		var srcStart int
		if isInsert {
			srcStart = readCellOffset(src, elem.index)
		} else {
			srcStart = readCellOffset(src, elem.index+1)
		}
		srcEnd := readCellEndingOffset(src, srcCellN-1)
		copy(dst[offset:], src[srcStart:srcEnd])

		// Rewrite ending index slots by position moved.
		shiftN := offset - srcStart
		for i := elem.index + 1; i < dstCellN; i++ {
			srci := i
			if isInsert {
				// Inserted cell shifted subsequent source cells right by one.
				srci--
			}
			writeCellOffset(dst, i, readCellOffset(src, srci)+shiftN)
		}
	}

	// Write new page to dirty page cache.
	if err := c.tx.writePage(dst); err != nil {
		return err
	}

	// Free page if on heap.
	if isHeap {
		freePage(src)
	}
	return nil
}
// deleteLeafCell removes a cell from the currently positioned page & index.
// If the removal causes the leaf page to have no more elements then its entry
// is removed from the parent. If the removal changes the first entry in the
// leaf page then the entry will be updated in the parent branch page.
func (c *Cursor) deleteLeafCell(key uint64) (err error) {
	elem := &c.stack.elems[c.stack.top]
	leafPage, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return err
	}
	cells := readLeafCells(leafPage, c.leafCells[:])
	oldPageKey := cells[0].Key

	cell := readLeafCell(leafPage, elem.index)
	if cell.Type == ContainerTypeBitmapPtr {
		// The out-of-line bitmap page is no longer referenced; free it.
		if err := c.tx.freePgno(toPgno(cell.Data)); err != nil {
			return err
		}
	}

	// If no more cells exist and we have a parent, remove from parent.
	if c.stack.top > 0 && len(cells) == 1 {
		if err := c.tx.freePgno(elem.pgno); err != nil {
			return err
		}
		return c.deleteBranchCell(c.stack.top-1, cells[0].Key)
	}

	// Remove matching cell from list.
	copy(cells[elem.index:], cells[elem.index+1:])
	cells[len(cells)-1] = leafCell{}
	cells = cells[:len(cells)-1]

	// Write cells to page.
	buf := allocPage()
	writePageNo(buf[:], elem.pgno)
	writeFlags(buf[:], PageTypeLeaf)
	writeCellN(buf[:], len(cells))

	offset := dataOffset(len(cells))
	for j, cell := range cells {
		writeLeafCell(buf[:], j, offset, cell)
		offset += align8(cell.Size())
	}
	if err := c.tx.writePage(buf[:]); err != nil {
		return err
	}

	// Update the parent's reference key if it's changed.
	if c.stack.top > 0 && oldPageKey != cells[0].Key {
		return c.updateBranchCell(c.stack.top-1, cells[0].Key)
	}
	return nil
}
// putBranchCells updates a branch page with one or more cells. If the cells
// cause the branch page to split then a new entry will be created in the
// parent page. If this branch page is a root then a new root will be created.
func (c *Cursor) putBranchCells(stackIndex int, newCells []branchCell) (err error) {
	elem := &c.stack.elems[stackIndex]

	// Read branch page from disk. The current buffer is the leaf page.
	page, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return err
	}
	cells := readBranchCells(page)
	if len(cells) == 0 {
		// Ensure at least one slot exists to overwrite below.
		cells = make([]branchCell, 1)
	}

	// Update current cell & insert additional cells after it.
	cells[elem.index] = newCells[0]
	if len(newCells) > 1 {
		// Grow the slice, shift the tail right, then copy the extras in.
		cells = append(cells, make([]branchCell, len(newCells)-1)...)
		copy(cells[elem.index+len(newCells):], cells[elem.index+1:])
		copy(cells[elem.index+1:], newCells[1:])
	}

	// Split into multiple pages if page size is exceeded.
	groups := [][]branchCell{cells}
	if branchCellsPageSize(cells) > PageSize {
		groups = splitBranchCells(cells)
	}

	// Write each group to a separate page.
	parents := make([]branchCell, 0, len(groups))
	origPgno := readPageNo(page)
	newRoot := len(groups) > 1 && stackIndex == 0
	for i, group := range groups {
		// First page should overwrite the original.
		// Subsequent pages should allocate new pages.
		parent := branchCell{LeftKey: group[0].LeftKey}
		if i == 0 && !newRoot {
			parent.ChildPgno = origPgno
		} else {
			if parent.ChildPgno, err = c.tx.allocatePgno(); err != nil {
				return fmt.Errorf("cannot allocate branch: %w", err)
			}
		}
		parents = append(parents, parent)

		// Write cells to page.
		var buf [PageSize]byte
		writePageNo(buf[:], parents[i].ChildPgno)
		writeFlags(buf[:], PageTypeBranch)
		writeCellN(buf[:], len(group))

		offset := dataOffset(len(group))
		for j, cell := range group {
			writeBranchCell(buf[:], j, offset, cell)
			offset += align8(branchCellSize)
		}
		if err := c.tx.writePage(buf[:]); err != nil {
			return err
		}
	}

	// TODO(BBJ): Update page in buffer & cursor stack.

	// If this is not a split, then exit now.
	if len(groups) == 1 {
		return nil
	}

	// Initialize a new root if we are currently the root page.
	if stackIndex == 0 {
		assert(newRoot) // branch write must be root when stack at root
		return c.writeRoot(origPgno, parents)
	}
	assert(!newRoot) // branch write must NOT be root when stack at root

	// Otherwise update existing parent.
	return c.putBranchCells(stackIndex-1, parents)
}
// updateBranchCell updates the key for cell in the branch. Branch elements
// are fixed size so this cannot cause a page split.
func (c *Cursor) updateBranchCell(stackIndex int, newKey uint64) (err error) {
	elem := &c.stack.elems[stackIndex]

	// Load the branch page for this stack level.
	pg, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return err
	}
	cells := readBranchCells(pg)
	prevFirstKey := cells[0].LeftKey

	// Rewrite the key of the cell the cursor is positioned on.
	cells[elem.index].LeftKey = newKey

	// Serialize all cells back into a fresh page image.
	var out [PageSize]byte
	writePageNo(out[:], elem.pgno)
	writeFlags(out[:], PageTypeBranch)
	writeCellN(out[:], len(cells))

	off := dataOffset(len(cells))
	for i := range cells {
		writeBranchCell(out[:], i, off, cells[i])
		off += align8(branchCellSize)
	}
	if err := c.tx.writePage(out[:]); err != nil {
		return err
	}

	// If this page's smallest key changed, propagate the update upward.
	if stackIndex > 0 && prevFirstKey != cells[0].LeftKey {
		return c.updateBranchCell(stackIndex-1, cells[0].LeftKey)
	}
	return nil
}
// deleteBranchCell removes a cell from a branch page. If no more elements
// exist in a branch page then it is removed. If an empty branch page is the
// root page then it is converted to an empty leaf page as branch pages are not
// allowed to be empty.
func (c *Cursor) deleteBranchCell(stackIndex int, key uint64) (err error) {
	elem := &c.stack.elems[stackIndex]

	// Read branch page from disk. The current buffer is the leaf page.
	page, _, err := c.tx.readPage(elem.pgno)
	if err != nil {
		return err
	}
	cells := readBranchCells(page)
	oldPageKey := cells[0].LeftKey

	// Remove cell from branch.
	copy(cells[elem.index:], cells[elem.index+1:])
	cells[len(cells)-1] = branchCell{}
	cells = cells[:len(cells)-1]

	// Branches are not allowed to have zero element so we must remove the page
	// or, in the case of the root page, convert to a leaf page.
	if len(cells) == 0 {
		// If this is the root page, convert to leaf page.
		if stackIndex == 0 {
			var buf [PageSize]byte
			writePageNo(buf[:], elem.pgno)
			writeFlags(buf[:], PageTypeLeaf)
			writeCellN(buf[:], len(cells))
			return c.tx.writePage(buf[:])
		}

		// If this is a non-root page, free and remove from parent.
		if err := c.tx.freePgno(elem.pgno); err != nil {
			return err
		}
		return c.deleteBranchCell(stackIndex-1, oldPageKey)
	}

	// If the root only has one node, replace it with its child.
	if stackIndex == 0 && len(cells) == 1 {
		target, _, err := c.tx.readPage(cells[0].ChildPgno)
		if err != nil {
			return err
		}
		// Copy the child's contents into the root's page number so the
		// root record stays stable, then free the child's old page.
		buf := allocPage()
		copy(buf, target)
		writePageNo(buf[:], elem.pgno)
		if err := c.tx.freePgno(cells[0].ChildPgno); err != nil {
			return err
		}
		return c.tx.writePage(buf[:])
	}

	// Write cells to page.
	var buf [PageSize]byte
	writePageNo(buf[:], elem.pgno)
	writeFlags(buf[:], PageTypeBranch)
	writeCellN(buf[:], len(cells))

	offset := dataOffset(len(cells))
	for j, cell := range cells {
		writeBranchCell(buf[:], j, offset, cell)
		offset += align8(branchCellSize)
	}
	assert(readCellN(buf[:]) > 0) // must have at least one cell
	if err := c.tx.writePage(buf[:]); err != nil {
		return err
	}

	// If the page's smallest key changed, update the reference in the parent.
	if stackIndex > 0 && len(cells) > 0 && oldPageKey != cells[0].LeftKey {
		return c.updateBranchCell(stackIndex-1, cells[0].LeftKey)
	}
	return nil
}
// writeRoot replaces the page at pgno with a branch page holding cells.
// The root of a b-tree should always stay the same, even in the case of a
// split — keeping pgno stable means the b-tree's root record never needs
// rewriting.
func (c *Cursor) writeRoot(pgno uint32, cells []branchCell) error {
	var page [PageSize]byte
	writePageNo(page[:], pgno)
	writeFlags(page[:], PageTypeBranch)
	writeCellN(page[:], len(cells))

	off := dataOffset(len(cells))
	for i, cell := range cells {
		writeBranchCell(page[:], i, off, cell)
		off += align8(branchCellSize)
	}
	return c.tx.writePage(page[:])
}
// splitLeafCells splits cells into roughly equal parts. It's a naive
// implementation that splits cells whenever a page is 60% full.
func splitLeafCells(cells []leafCell) [][]leafCell {
	slices := make([][]leafCell, 1, 2)

	var dataSize int
	for _, cell := range cells {
		if cell.Type == ContainerTypeBitmap {
			panic("no! all ContainerTypeBitmap should be ContainerTypeBitmapPtr by now")
		}
		// Determine number of cells on current slice & cell size.
		cellN := len(slices[len(slices)-1])
		sz := align8(leafCellHeaderSize + len(cell.Data))

		// If there is at least one cell on the slice & we've exceeded
		// half a page then create a new group of cells.
		thresh := int(float64(PageSize) * globalBranchFillPct)
		if cellN != 0 && (dataOffset(cellN+1)+dataSize+sz) > thresh {
			slices, dataSize = append(slices, nil), 0
		} else if cellN != 0 && cell.Type == ContainerTypeArray && cell.ElemN > ArrayMaxSize {
			// Oversized array containers get a group of their own: start a
			// new group here and charge a full page so the NEXT cell also
			// begins a fresh group.
			slices, dataSize = append(slices, nil), 0
			sz = PageSize
		}

		// Append to current slice & increase total cell data size.
		slices[len(slices)-1] = append(slices[len(slices)-1], cell)
		dataSize += sz
	}
	return slices
}
// globalBranchFillPct is the fill fraction at which splitLeafCells and
// splitBranchCells start a new page group (a naive 60% fill heuristic).
var globalBranchFillPct = 0.60
// splitBranchCells splits cells into roughly equal parts. It's a naive
// implementation that splits cells whenever a page is 60% full.
func splitBranchCells(cells []branchCell) [][]branchCell {
	groups := make([][]branchCell, 1, 2)

	used := 0
	for _, cell := range cells {
		// Size of the current (last) group and of this fixed-size cell.
		cur := len(groups) - 1
		n := len(groups[cur])
		need := align8(branchCellSize)

		// Once a non-empty group would exceed the fill threshold,
		// start a fresh group.
		limit := int(float64(PageSize) * globalBranchFillPct)
		if n != 0 && (dataOffset(n+1)+used+need) > limit {
			groups = append(groups, nil)
			cur++
			used = 0
		}

		// Add the cell to the current group and account for its size.
		groups[cur] = append(groups[cur], cell)
		used += need
	}
	return groups
}
// pageKeyAt returns the key at the given index of the page.
// The key is the first 8 bytes of the cell at that index's offset, read with
// an unsafe host-endian load. NOTE(review): this assumes a little-endian host
// (matching toPgno's explicit little-endian decoding) and that cell offsets
// are 8-byte aligned (cells are written with align8) — confirm before porting
// to other architectures.
func pageKeyAt(page []byte, index int) uint64 {
	offset := readCellOffset(page, index)
	return *(*uint64)(unsafe.Pointer(&page[offset]))
}
// First moves to the first element of the btree. Returns io.EOF if no elements
// exist. The first call to Next() will not move the position but subsequent
// calls will move the position forward until it reaches the end and return io.EOF.
func (c *Cursor) First() error {
c.buffered = true
for c.stack.top = 0; ; c.stack.top++ {
elem := &c.stack.elems[c.stack.top]
buf, _, err := c.tx.readPage(elem.pgno)
if err != nil {
return err
}
switch typ := readFlags(buf); typ {
case PageTypeBranch:
elem.index = 0
if n := readCellN(buf); elem.index >= n { // branch cell index must less than cell count
return fmt.Errorf("branch cell index out of range: pgno=%d i=%d n=%d", elem.pgno, elem.index, n)
}