@@ -10,6 +10,7 @@ import (
10
10
"fmt"
11
11
"io"
12
12
"iter"
13
+ "maps"
13
14
"slices"
14
15
"sort"
15
16
@@ -20,6 +21,7 @@ import (
20
21
"github.com/cockroachdb/pebble/sstable"
21
22
"github.com/cockroachdb/pebble/sstable/blob"
22
23
"github.com/cockroachdb/pebble/sstable/block"
24
+ "github.com/cockroachdb/pebble/sstable/colblk"
23
25
)
24
26
25
27
// This file implements DB.CheckLevels() which checks that every entry in the
@@ -360,6 +362,7 @@ type checkConfig struct {
360
362
// blobValueFetcher is the ValueFetcher to use when retrieving values stored
361
363
// externally in blob files.
362
364
blobValueFetcher blob.ValueFetcher
365
+ fileCache * fileCacheHandle
363
366
}
364
367
365
368
// cmp is shorthand for comparer.Compare.
@@ -527,6 +530,7 @@ type CheckLevelsStats struct {
527
530
// - Point keys in sstables are ordered.
528
531
// - Range delete tombstones in sstables are ordered and fragmented.
529
532
// - Successful processing of all MERGE records.
533
+ // - Each sstable's blob reference liveness block is valid.
530
534
func (d * DB ) CheckLevels (stats * CheckLevelsStats ) error {
531
535
// Grab and reference the current readState.
532
536
readState := d .loadReadState ()
@@ -548,8 +552,9 @@ func (d *DB) CheckLevels(stats *CheckLevelsStats) error {
548
552
readEnv : block.ReadEnv {
549
553
// TODO(jackson): Add categorized stats.
550
554
},
555
+ fileCache : d .fileCache ,
551
556
}
552
- checkConfig .blobValueFetcher .Init (& readState .current .BlobFiles , d .fileCache , checkConfig .readEnv )
557
+ checkConfig .blobValueFetcher .Init (& readState .current .BlobFiles , checkConfig .fileCache , checkConfig .readEnv )
553
558
defer func () { _ = checkConfig .blobValueFetcher .Close () }()
554
559
return checkLevelsInternal (checkConfig )
555
560
}
@@ -608,6 +613,8 @@ func checkLevelsInternal(c *checkConfig) (err error) {
608
613
mlevels = append (mlevels , simpleMergingIterLevel {})
609
614
}
610
615
mlevelAlloc := mlevels [start :]
616
+ var allTables []* manifest.TableMetadata
617
+
611
618
// Add L0 files by sublevel.
612
619
for sublevel := len (current .L0SublevelFiles ) - 1 ; sublevel >= 0 ; sublevel -- {
613
620
if current .L0SublevelFiles [sublevel ].Empty () {
@@ -621,19 +628,24 @@ func checkLevelsInternal(c *checkConfig) (err error) {
621
628
li .initRangeDel (& mlevelAlloc [0 ])
622
629
mlevelAlloc [0 ].iter = li
623
630
mlevelAlloc = mlevelAlloc [1 :]
631
+ for f := range current .L0SublevelFiles [sublevel ].All () {
632
+ allTables = append (allTables , f )
633
+ }
624
634
}
625
635
for level := 1 ; level < len (current .Levels ); level ++ {
626
636
if current .Levels [level ].Empty () {
627
637
continue
628
638
}
629
-
630
639
iterOpts := IterOptions {logger : c .logger }
631
640
li := & levelIter {}
632
641
li .init (context .Background (), iterOpts , c .comparer , c .newIters ,
633
642
current .Levels [level ].Iter (), manifest .Level (level ), internalOpts )
634
643
li .initRangeDel (& mlevelAlloc [0 ])
635
644
mlevelAlloc [0 ].iter = li
636
645
mlevelAlloc = mlevelAlloc [1 :]
646
+ for f := range current .Levels [level ].All () {
647
+ allTables = append (allTables , f )
648
+ }
637
649
}
638
650
639
651
mergingIter := & simpleMergingIter {}
@@ -648,7 +660,150 @@ func checkLevelsInternal(c *checkConfig) (err error) {
648
660
}
649
661
650
662
// Phase 2: Check that the tombstones are mutually consistent.
651
- return checkRangeTombstones (c )
663
+ if err := checkRangeTombstones (c ); err != nil {
664
+ return err
665
+ }
666
+
667
+ // Phase 3: Validate blob value liveness block for all tables in the LSM.
668
+ // TODO(annie): This is a very expensive operation. We should try to reduce
669
+ // the amount of work performed. One possibility is to have the caller
670
+ // pass in a prng seed and use that to choose which tables to validate.
671
+ if err := validateBlobValueLiveness (allTables , c .fileCache , c .readEnv , & c .blobValueFetcher ); err != nil {
672
+ return err
673
+ }
674
+
675
+ return nil
676
+ }
677
+
678
// valuesInfo accumulates what a single sstable references within one block of
// a blob file.
type valuesInfo struct {
	// valueIDs holds the value IDs decoded from each blob handle that points
	// into the block, in the order the handles were encountered.
	valueIDs []int
	// totalSize is the running sum of the lengths reported by those handles.
	totalSize int
}
682
+
683
+ // gatherBlobHandles gathers all the blob handles in an sstable, returning a
684
+ // slice of maps; indexing into the slice at `i` is equivalent to retrieving
685
+ // each blob.BlockID's referenced blob.BlockValueID for the `i`th blob reference.
686
+ func gatherBlobHandles (
687
+ ctx context.Context ,
688
+ r * sstable.Reader ,
689
+ blobRefs manifest.BlobReferences ,
690
+ valueFetcher base.ValueFetcher ,
691
+ ) ([]map [blob.BlockID ]valuesInfo , error ) {
692
+ iter , err := r .NewPointIter (ctx , sstable.IterOptions {
693
+ BlobContext : sstable.TableBlobContext {
694
+ ValueFetcher : valueFetcher ,
695
+ References : & blobRefs ,
696
+ },
697
+ })
698
+ if err != nil {
699
+ return nil , err
700
+ }
701
+ defer func () { _ = iter .Close () }()
702
+
703
+ referenced := make ([]map [blob.BlockID ]valuesInfo , len (blobRefs ))
704
+ for i := range referenced {
705
+ referenced [i ] = make (map [blob.BlockID ]valuesInfo )
706
+ }
707
+ for kv := iter .First (); kv != nil ; kv = iter .Next () {
708
+ if kv .V .IsBlobValueHandle () {
709
+ lv := kv .V .LazyValue ()
710
+ handleSuffix := blob .DecodeHandleSuffix (lv .ValueOrHandle )
711
+ refID , ok := blobRefs .IDByBlobFileID (lv .Fetcher .BlobFileID )
712
+ if ! ok {
713
+ return nil , errors .Errorf ("blob file ID %d not found in blob references" , lv .Fetcher .BlobFileID )
714
+ }
715
+ blockID := handleSuffix .BlockID
716
+ valueID := int (handleSuffix .ValueID )
717
+ vi := referenced [refID ][blockID ]
718
+ vi .valueIDs = append (vi .valueIDs , valueID )
719
+ vi .totalSize += lv .Len ()
720
+ referenced [refID ][blockID ] = vi
721
+ }
722
+ }
723
+ return referenced , nil
724
+ }
725
+
726
+ func performValidationForSSTable (
727
+ decoder colblk.ReferenceLivenessBlockDecoder ,
728
+ tableNum base.TableNum ,
729
+ referenced []map [blob.BlockID ]valuesInfo ,
730
+ ) error {
731
+ if len (referenced ) != decoder .BlockDecoder ().Rows () {
732
+ return errors .Errorf ("mismatch in number of references in blob value " +
733
+ "liveness block: expected=%d found=%d" , len (referenced ),
734
+ decoder .BlockDecoder ().Rows ())
735
+ }
736
+ for refID , blockValues := range referenced {
737
+ bitmapEncodings := slices .Clone (decoder .LivenessAtReference (refID ))
738
+ for _ , blockEnc := range sstable .DecodeBlobRefLivenessEncoding (bitmapEncodings ) {
739
+ blockID := blockEnc .BlockID
740
+ vi , ok := blockValues [blockID ]
741
+ if ! ok {
742
+ return errors .Errorf ("dangling refID=%d blockID=%d in blob value " +
743
+ "liveness encoding for sstable %d" , refID , blockID , tableNum )
744
+ }
745
+ encodedVals := slices .Collect (sstable .IterSetBitsInRunLengthBitmap (blockEnc .Bitmap ))
746
+ if ! slices .Equal (vi .valueIDs , encodedVals ) {
747
+ return errors .Errorf ("bitmap mismatch for refID=%d blockID=%d: " +
748
+ "expected=%v encoded=%v for sstable %d" , refID , blockID , vi .valueIDs ,
749
+ encodedVals , tableNum )
750
+ }
751
+ if vi .totalSize != blockEnc .ValuesSize {
752
+ return errors .Errorf ("value size mismatch for refID=%d blockID=%d: " +
753
+ "expected=%d encoded=%d for sstable %d" , refID , blockID , vi .totalSize ,
754
+ blockEnc .ValuesSize , tableNum )
755
+ }
756
+ // Remove the processed blockID from the map so that later,
757
+ // we can check if we processed everything. This is to
758
+ // ensure that we do not have any missing references in the
759
+ // blob reference liveness block for any of the references
760
+ // in the sstable.
761
+ delete (blockValues , blockID )
762
+ }
763
+ if len (blockValues ) > 0 {
764
+ return errors .Errorf ("refID=%d blockIDs=%v referenced by sstable %d " +
765
+ "is/are not present in blob reference liveness block" , refID ,
766
+ slices .Collect (maps .Keys (blockValues )), tableNum )
767
+ }
768
+ }
769
+ return nil
770
+ }
771
+
772
+ // validateBlobValueLiveness iterates through each table,
773
+ // gathering all the blob handles, and then compares the values encoded in the
774
+ // blob reference liveness block to the values referenced by the blob handles.
775
+ func validateBlobValueLiveness (
776
+ tables []* manifest.TableMetadata ,
777
+ fc * fileCacheHandle ,
778
+ readEnv block.ReadEnv ,
779
+ valueFetcher base.ValueFetcher ,
780
+ ) error {
781
+ ctx := context .TODO ()
782
+ var decoder colblk.ReferenceLivenessBlockDecoder
783
+ for _ , t := range tables {
784
+ if len (t .BlobReferences ) == 0 {
785
+ continue
786
+ }
787
+ if err := fc .withReader (ctx , readEnv , t , func (r * sstable.Reader , readEnv sstable.ReadEnv ) error {
788
+ // For this sstable, gather all the blob handles -- tracking
789
+ // each blob.ReferenceID + blob.BlockID's referenced
790
+ // blob.BlockValueIDs.
791
+ referenced , err := gatherBlobHandles (ctx , r , t .BlobReferences , valueFetcher )
792
+ if err != nil {
793
+ return err
794
+ }
795
+ h , err := r .ReadBlobRefIndexBlock (ctx , readEnv .Block )
796
+ if err != nil {
797
+ return err
798
+ }
799
+ defer h .Release ()
800
+ decoder .Init (h .BlockData ())
801
+ return performValidationForSSTable (decoder , t .TableNum , referenced )
802
+ }); err != nil {
803
+ return err
804
+ }
805
+ }
806
+ return nil
652
807
}
653
808
654
809
type simpleMergingIterItem struct {
0 commit comments