-
Notifications
You must be signed in to change notification settings - Fork 19.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
core/rawdb: freezer index repair #29792
base: master
Are you sure you want to change the base?
Changes from all commits
b92db0b
c8b8876
1bc011c
92ea3f7
8381d61
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
package rawdb | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"encoding/binary" | ||
"errors" | ||
|
@@ -26,6 +27,7 @@ import ( | |
"path/filepath" | ||
"sync" | ||
"sync/atomic" | ||
"time" | ||
|
||
"github.com/ethereum/go-ethereum/common" | ||
"github.com/ethereum/go-ethereum/log" | ||
|
@@ -219,7 +221,13 @@ func (t *freezerTable) repair() error { | |
return err | ||
} // New file can't trigger this path | ||
} | ||
// Retrieve the file sizes and prepare for truncation | ||
// Validate the index file as it might contain some garbage data after the | ||
// power failures. | ||
if err := t.repairIndex(); err != nil { | ||
return err | ||
} | ||
// Retrieve the file sizes and prepare for truncation. Note the file size | ||
// might be changed after index validation. | ||
if stat, err = t.index.Stat(); err != nil { | ||
return err | ||
} | ||
|
@@ -364,6 +372,126 @@ func (t *freezerTable) repair() error { | |
return nil | ||
} | ||
|
||
// repairIndex validates the integrity of the index file. According to the design, | ||
// the initial entry in the file denotes the earliest data file along with the | ||
// count of deleted items. Following this, all subsequent entries in the file must | ||
// be in order. This function identifies any corrupted entries and truncates items | ||
// occurring after the corruption point. | ||
// | ||
// corruption can occur because of the power failure. In the Linux kernel, the | ||
// file metadata update and data update are not necessarily performed at the | ||
// same time. Typically, the metadata will be flushed/journalled ahead of the file | ||
// data. Therefore, we make the pessimistic assumption that the file is first | ||
// extended with invalid "garbage" data (normally zero bytes) and that afterwards | ||
// the correct data replaces the garbage. As all the items in index file are | ||
// supposed to be in-order, the leftover garbage must be truncated before the | ||
// index data is utilized. | ||
// | ||
// It's important to note an exception that's unfortunately undetectable: when | ||
// all index entries in the file are zero. Distinguishing whether they represent | ||
// leftover garbage or if all items in the table have zero size is impossible. | ||
// In such instances, the file will remain unchanged to prevent potential data | ||
// loss or misinterpretation. | ||
func (t *freezerTable) repairIndex() error { | ||
// Retrieve the file sizes and prepare for validation | ||
stat, err := t.index.Stat() | ||
if err != nil { | ||
return err | ||
} | ||
size := stat.Size() | ||
|
||
// Move the read cursor to the beginning of the file | ||
_, err = t.index.Seek(0, io.SeekStart) | ||
if err != nil { | ||
return err | ||
} | ||
fr := bufio.NewReader(t.index) | ||
|
||
var ( | ||
start = time.Now() | ||
buff = make([]byte, indexEntrySize) | ||
prev indexEntry | ||
head indexEntry | ||
|
||
read = func() (indexEntry, error) { | ||
n, err := io.ReadFull(fr, buff) | ||
if err != nil { | ||
return indexEntry{}, err | ||
} | ||
if n != indexEntrySize { | ||
return indexEntry{}, fmt.Errorf("failed to read from index, n: %d", n) | ||
} | ||
var entry indexEntry | ||
entry.unmarshalBinary(buff) | ||
return entry, nil | ||
} | ||
truncate = func(offset int64) error { | ||
if t.readonly { | ||
return fmt.Errorf("index file is corrupted at %d, size: %d", offset, size) | ||
} | ||
if err := truncateFreezerFile(t.index, offset); err != nil { | ||
return err | ||
} | ||
log.Warn("Truncated index file", "offset", offset, "truncated", size-offset) | ||
return nil | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These functions could just be methods on freezerTable |
||
) | ||
for offset := int64(0); offset < size; offset += indexEntrySize { | ||
entry, err := read() | ||
if err != nil { | ||
return err | ||
} | ||
if offset == 0 { | ||
head = entry | ||
continue | ||
} | ||
// Ensure that the first non-head index refers to the earliest file, | ||
// or the next file if the earliest file is not sufficient to | ||
// place the first item. | ||
if offset == indexEntrySize { | ||
if entry.filenum != head.filenum && entry.filenum != head.filenum+1 { | ||
log.Error("Corrupted index item detected", "earliest", head.filenum, "filenumber", entry.filenum) | ||
return truncate(offset) | ||
} | ||
prev = entry | ||
continue | ||
} | ||
// ensure two consecutive index items are in order | ||
if err := t.checkIndexItems(prev, entry); err != nil { | ||
log.Error("Corrupted index item detected", "err", err) | ||
return truncate(offset) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should log the error here. Maybe pass the error into |
||
} | ||
prev = entry | ||
} | ||
// Move the read cursor to the end of the file. While theoretically, the | ||
// cursor should reach the end by reading all the items in the file, perform | ||
// the seek operation anyway as a precaution. | ||
_, err = t.index.Seek(0, io.SeekEnd) | ||
if err != nil { | ||
return err | ||
} | ||
log.Debug("Verified index file", "items", size/indexEntrySize, "elapsed", common.PrettyDuration(time.Since(start))) | ||
return nil | ||
} | ||
|
||
// checkIndexItems checks the validity of two consecutive index items. The index | ||
// item is regarded as invalid if: | ||
// - file number of two index items are not same and not monotonically increasing | ||
// - data offset of two index items with same file number are out of order | ||
// - zero data offset with an increasing file number | ||
func (t *freezerTable) checkIndexItems(a, b indexEntry) error { | ||
if b.filenum != a.filenum && b.filenum != a.filenum+1 { | ||
return fmt.Errorf("index items with inconsistent file number, prev: %d, next: %d", a.filenum, b.filenum) | ||
} | ||
if b.filenum == a.filenum && b.offset < a.offset { | ||
return fmt.Errorf("index items with unordered offset, prev: %d, next: %d", a.offset, b.offset) | ||
} | ||
if b.filenum == a.filenum+1 && b.offset == 0 { | ||
return fmt.Errorf("index items with zero offset, file number: %d", b.filenum) | ||
} | ||
return nil | ||
} | ||
|
||
// preopen opens all files that the freezer will need. This method should be called from an init-context, | ||
// since it assumes that it doesn't have to bother with locking | ||
// The rationale for doing preopen is to not have to do it from within Retrieve, thus not needing to ever | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we remove the sync call above as well, if we extend/modify the repair?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately, we have no mechanism to detect whether the data file (batch.t.head) contains garbage after a power failure. That is why I left the sync operation in place for the data file.
However, I think we could sync the data file in the background at a fixed time interval. For example, MongoDB uses this strategy, flushing the file every 100 milliseconds.