Skip to content

Commit a45eb82

Browse files
committed
sstable: do a bit flip computation on a checksum mismatch in the Reader
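When a checksum mismatch occurs, this update tries flipping each bit of the block data, one at a time, to see whether a single-bit error explains the mismatch. If such a bit is found, the error message additionally reports the index of the affected byte together with the byte value that was read and the value it would have without the flip. Fixes: #2571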
1 parent 8701659 commit a45eb82

File tree

2 files changed

+162
-74
lines changed

2 files changed

+162
-74
lines changed

sstable/block/block.go

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ package block
77
import (
88
"context"
99
"encoding/binary"
10+
"fmt"
1011
"path/filepath"
1112
"runtime"
13+
"slices"
1214
"time"
1315

1416
"github.com/cespare/xxhash/v2"
@@ -171,13 +173,55 @@ func ValidateChecksum(checksumType ChecksumType, b []byte, bh Handle) error {
171173
return errors.Errorf("unsupported checksum type: %d", checksumType)
172174
}
173175
if expectedChecksum != computedChecksum {
174-
return base.CorruptionErrorf("block %d/%d: %s checksum mismatch %x != %x",
176+
// Check whether the checksum mismatch was caused by a single bit flip and, if so, report it.
177+
data := slices.Clone(b[:bh.Length+1])
178+
found, indexFound, bitFound := checkSliceForBitFlip(data, checksumType, expectedChecksum)
179+
bitFlipExtraMsg := ""
180+
if found {
181+
bitFlipExtraMsg = fmt.Sprintf(". bit flip found: byte index %d. got: %x. want: %x.",
182+
indexFound, data[indexFound], data[indexFound]^(1<<bitFound))
183+
}
184+
return base.CorruptionErrorf("block %d/%d: %s checksum mismatch %x != %x%s",
175185
errors.Safe(bh.Offset), errors.Safe(bh.Length), checksumType,
176-
expectedChecksum, computedChecksum)
186+
expectedChecksum, computedChecksum, bitFlipExtraMsg)
177187
}
178188
return nil
179189
}
180190

191+
func checkSliceForBitFlip(
192+
data []byte, checksumType ChecksumType, expectedChecksum uint32,
193+
) (found bool, indexFound int, bitFound int) {
194+
// TODO(edward) This checking process likely can be made faster.
195+
iterationLimit := 40 * (1 << 10) // 40KB
196+
for i := 0; i < min(len(data), iterationLimit); i++ {
197+
foundFlip, bit := checkByteForFlip(data, i, checksumType, expectedChecksum)
198+
if foundFlip {
199+
return true, i, bit
200+
}
201+
}
202+
return false, 0, 0
203+
}
204+
205+
func checkByteForFlip(
206+
data []byte, i int, checksumType ChecksumType, expectedChecksum uint32,
207+
) (found bool, bit int) {
208+
for bit := 0; bit < 8; bit++ {
209+
data[i] ^= (1 << bit)
210+
var computedChecksum uint32
211+
switch checksumType {
212+
case ChecksumTypeCRC32c:
213+
computedChecksum = crc.New(data).Value()
214+
case ChecksumTypeXXHash64:
215+
computedChecksum = uint32(xxhash.Sum64(data))
216+
}
217+
data[i] ^= (1 << bit)
218+
if computedChecksum == expectedChecksum {
219+
return true, bit
220+
}
221+
}
222+
return false, 0
223+
}
224+
181225
// Metadata is an in-memory buffer that stores metadata for a block. It is
182226
// allocated together with the buffer storing the block and is initialized once
183227
// when the block is read from disk.

sstable/reader_test.go

Lines changed: 116 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,85 +1418,129 @@ func TestReaderChecksumErrors(t *testing.T) {
14181418
t.Run(fmt.Sprintf("checksum-type=%d", checksumType), func(t *testing.T) {
14191419
for _, twoLevelIndex := range []bool{false, true} {
14201420
t.Run(fmt.Sprintf("two-level-index=%t", twoLevelIndex), func(t *testing.T) {
1421-
mem := vfs.NewMem()
1421+
for _, corruptionType := range []string{"first-byte", "random-bit"} {
1422+
t.Run(fmt.Sprintf("corruption-type=%s", corruptionType), func(t *testing.T) {
1423+
mem := vfs.NewMem()
1424+
1425+
{
1426+
// Create an sstable with 3 data blocks.
1427+
f, err := mem.Create("test", vfs.WriteCategoryUnspecified)
1428+
require.NoError(t, err)
1429+
1430+
const blockSize = 32
1431+
indexBlockSize := 4096
1432+
if twoLevelIndex {
1433+
indexBlockSize = 1
1434+
}
14221435

1423-
{
1424-
// Create an sstable with 3 data blocks.
1425-
f, err := mem.Create("test", vfs.WriteCategoryUnspecified)
1426-
require.NoError(t, err)
1436+
w := NewWriter(objstorageprovider.NewFileWritable(f), WriterOptions{
1437+
BlockSize: blockSize,
1438+
IndexBlockSize: indexBlockSize,
1439+
Checksum: checksumType,
1440+
})
1441+
require.NoError(t, w.Set(bytes.Repeat([]byte("a"), blockSize), nil))
1442+
require.NoError(t, w.Set(bytes.Repeat([]byte("b"), blockSize), nil))
1443+
require.NoError(t, w.Set(bytes.Repeat([]byte("c"), blockSize), nil))
1444+
require.NoError(t, w.Close())
1445+
}
14271446

1428-
const blockSize = 32
1429-
indexBlockSize := 4096
1430-
if twoLevelIndex {
1431-
indexBlockSize = 1
1432-
}
1447+
// Load the layout so that we know the location of the data blocks.
1448+
var layout *Layout
1449+
{
1450+
f, err := mem.Open("test")
1451+
require.NoError(t, err)
1452+
1453+
r, err := newReader(f, ReaderOptions{})
1454+
require.NoError(t, err)
1455+
layout, err = r.Layout()
1456+
require.NoError(t, err)
1457+
require.EqualValues(t, len(layout.Data), 3)
1458+
require.NoError(t, r.Close())
1459+
}
14331460

1434-
w := NewWriter(objstorageprovider.NewFileWritable(f), WriterOptions{
1435-
BlockSize: blockSize,
1436-
IndexBlockSize: indexBlockSize,
1437-
Checksum: checksumType,
1438-
})
1439-
require.NoError(t, w.Set(bytes.Repeat([]byte("a"), blockSize), nil))
1440-
require.NoError(t, w.Set(bytes.Repeat([]byte("b"), blockSize), nil))
1441-
require.NoError(t, w.Set(bytes.Repeat([]byte("c"), blockSize), nil))
1442-
require.NoError(t, w.Close())
1443-
}
1461+
for _, bh := range layout.Data {
14441462

1445-
// Load the layout so that we no the location of the data blocks.
1446-
var layout *Layout
1447-
{
1448-
f, err := mem.Open("test")
1449-
require.NoError(t, err)
1450-
1451-
r, err := newReader(f, ReaderOptions{})
1452-
require.NoError(t, err)
1453-
layout, err = r.Layout()
1454-
require.NoError(t, err)
1455-
require.EqualValues(t, len(layout.Data), 3)
1456-
require.NoError(t, r.Close())
1457-
}
1463+
// Read the sstable and corrupt the first byte or a random bit in
1464+
// the target data block.
1465+
orig, err := mem.Open("test")
1466+
require.NoError(t, err)
1467+
data, err := io.ReadAll(orig)
1468+
require.NoError(t, err)
1469+
require.NoError(t, orig.Close())
14581470

1459-
for _, bh := range layout.Data {
1460-
// Read the sstable and corrupt the first byte in the target data
1461-
// block.
1462-
orig, err := mem.Open("test")
1463-
require.NoError(t, err)
1464-
data, err := io.ReadAll(orig)
1465-
require.NoError(t, err)
1466-
require.NoError(t, orig.Close())
1467-
1468-
// Corrupt the first byte in the block.
1469-
data[bh.Offset] ^= 0xff
1470-
1471-
corrupted, err := mem.Create("corrupted", vfs.WriteCategoryUnspecified)
1472-
require.NoError(t, err)
1473-
_, err = corrupted.Write(data)
1474-
require.NoError(t, err)
1475-
require.NoError(t, corrupted.Close())
1476-
1477-
// Verify that we encounter a checksum mismatch error while iterating
1478-
// over the sstable.
1479-
corrupted, err = mem.Open("corrupted")
1480-
require.NoError(t, err)
1481-
1482-
r, err := newReader(corrupted, ReaderOptions{})
1483-
require.NoError(t, err)
1484-
1485-
iter, err := r.NewIter(NoTransforms, nil, nil)
1486-
require.NoError(t, err)
1487-
for kv := iter.First(); kv != nil; kv = iter.Next() {
1488-
}
1489-
require.Regexp(t, `checksum mismatch`, iter.Error())
1490-
require.Regexp(t, `checksum mismatch`, iter.Close())
1471+
if corruptionType == "first-byte" {
1472+
data[bh.Offset] ^= 0xff
1473+
} else {
1474+
// Corrupt a random bit in the block.
1475+
r := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), 0))
1476+
randOffset := r.Uint64N(bh.Length) + bh.Offset
1477+
println(bh.Offset, bh.Length)
1478+
randBit := uint(r.IntN(8))
1479+
data[randOffset] ^= (1 << randBit)
1480+
}
14911481

1492-
iter, err = r.NewIter(NoTransforms, nil, nil)
1493-
require.NoError(t, err)
1494-
for kv := iter.Last(); kv != nil; kv = iter.Prev() {
1495-
}
1496-
require.Regexp(t, `checksum mismatch`, iter.Error())
1497-
require.Regexp(t, `checksum mismatch`, iter.Close())
1482+
corrupted, err := mem.Create("corrupted", vfs.WriteCategoryUnspecified)
1483+
require.NoError(t, err)
1484+
_, err = corrupted.Write(data)
1485+
require.NoError(t, err)
1486+
require.NoError(t, corrupted.Close())
1487+
1488+
// Verify that we encounter a checksum mismatch error while iterating
1489+
// over the sstable.
1490+
corrupted, err = mem.Open("corrupted")
1491+
require.NoError(t, err)
14981492

1499-
require.NoError(t, r.Close())
1493+
r, err := newReader(corrupted, ReaderOptions{})
1494+
1495+
if corruptionType == "first-byte" {
1496+
require.NoError(t, err)
1497+
iter, err := r.NewIter(NoTransforms, nil, nil)
1498+
require.NoError(t, err)
1499+
for kv := iter.First(); kv != nil; kv = iter.Next() {
1500+
}
1501+
require.Regexp(t, `checksum mismatch`, iter.Error())
1502+
require.Regexp(t, `checksum mismatch`, iter.Close())
1503+
1504+
iter, err = r.NewIter(NoTransforms, nil, nil)
1505+
require.NoError(t, err)
1506+
for kv := iter.Last(); kv != nil; kv = iter.Prev() {
1507+
}
1508+
require.Regexp(t, `checksum mismatch`, iter.Error())
1509+
require.Regexp(t, `checksum mismatch`, iter.Close())
1510+
1511+
require.NoError(t, r.Close())
1512+
} else {
1513+
// Check that the error message has the bit flip message if there was an error.
1514+
checkBitFlipErr := func(err error) bool {
1515+
if err != nil {
1516+
require.Regexp(t, `checksum mismatch.+bit flip found:.+`, err)
1517+
return true
1518+
}
1519+
return false
1520+
}
1521+
if checkBitFlipErr(err) {
1522+
break
1523+
}
1524+
iter, err := r.NewIter(NoTransforms, nil, nil)
1525+
if checkBitFlipErr(err) {
1526+
break
1527+
}
1528+
for kv := iter.First(); kv != nil; kv = iter.Next() {
1529+
}
1530+
if checkBitFlipErr(iter.Error()) && checkBitFlipErr(iter.Close()) {
1531+
}
1532+
iter, err = r.NewIter(NoTransforms, nil, nil)
1533+
if checkBitFlipErr(err) {
1534+
break
1535+
}
1536+
for kv := iter.Last(); kv != nil; kv = iter.Prev() {
1537+
}
1538+
if checkBitFlipErr(iter.Error()) && checkBitFlipErr(iter.Close()) {
1539+
}
1540+
require.NoError(t, r.Close())
1541+
}
1542+
}
1543+
})
15001544
}
15011545
})
15021546
}

0 commit comments

Comments
 (0)