Skip to content

Commit

Permalink
storage/engine,libroach: 1x write amplification on ingest sst
Browse files Browse the repository at this point in the history
This uses he new flag added in
facebook/rocksdb#4172 to avoid the global_seq_no
edits and the SST copying they forced us to do to preserve the raft log.

Passing this flag has to be gated on a cluster version that is defined
well after the upgrade to rocks 5.17 -- using the flag requires that
older versions of RocksDB (< 5.16) will never be used to read, as they
will still be looking for and using the seq_no in the file.

Release note (performance improvement): reduce write-amplification during bulk-loading (IMPORT and RESTORE)
  • Loading branch information
dt committed Feb 22, 2019
1 parent 00670e0 commit 4ffed77
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 24 deletions.
20 changes: 15 additions & 5 deletions c-deps/libroach/db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ DBStatus DBGetSortedWALFiles(DBEngine* db, DBWALFile** files, int* n) {
DBString DBGetUserProperties(DBEngine* db) { return db->GetUserProperties(); }

DBStatus DBIngestExternalFiles(DBEngine* db, char** paths, size_t len, bool move_files,
bool allow_file_modifications) {
bool write_global_seqno, bool allow_file_modifications) {
std::vector<std::string> paths_vec;
for (size_t i = 0; i < len; i++) {
paths_vec.push_back(paths[i]);
Expand All @@ -691,10 +691,20 @@ DBStatus DBIngestExternalFiles(DBEngine* db, char** paths, size_t len, bool move
ingest_options.snapshot_consistency = true;
// If a file is ingested over existing data (including the range tombstones
// used by range snapshots) or if a RocksDB snapshot is outstanding when this
// ingest runs, then after moving/copying the file, RocksDB will edit it
// (overwrite some of the bytes) to have a global sequence number. If this is
// false, it will error in these cases instead.
ingest_options.allow_global_seqno = allow_file_modifications;
// ingest runs, then after moving/copying the file, historically RocksDB would
// edit it (overwrite some of the bytes) to have a global sequence number.
// After https://github.com/facebook/rocksdb/pull/4172 this can be disabled
// (with the mutable manifest/metadata tracking that instead). However it is
// only safe to disable the seqno write if older versions of RocksDB (<5.16)
// will not be used to read these SSTs.
//
// RocksDB checks the option allow_global_seqno and, if it is false, returns
// an error instead of ingesting a file that would require one. However it
// does this check *even if it is not planning on writing seqno* at all, so we
// need to set allow_global_seqno to true for !write_global_seqno to have the
// desired effect.
ingest_options.write_global_seqno = write_global_seqno;
ingest_options.allow_global_seqno = allow_file_modifications || !write_global_seqno;
// If there are mutations in the memtable for the keyrange covered by the file
// being ingested, this option is checked. If true, the memtable is flushed
// and the ingest run. If false, an error is returned.
Expand Down
7 changes: 5 additions & 2 deletions c-deps/libroach/include/libroach.h
Original file line number Diff line number Diff line change
Expand Up @@ -408,9 +408,12 @@ DBString DBGetUserProperties(DBEngine* db);
// what can be added. If move_files is true, the files will be moved instead of
// copied. If allow_file_modifications is false, RocksDB will return an error if
// it would have tried to modify any of the files' sequence numbers rather than
// editing the files in place.
// editing the files in place. If write_global_seqno is false, it will skip
// writing the seq_no to the SSTs -- this is only safe if this will only ever be
// read by Rocks version >= 5.16 as older versions would still be looking for
// the seqno.
DBStatus DBIngestExternalFiles(DBEngine* db, char** paths, size_t len, bool move_files,
bool allow_file_modifications);
bool write_global_seqno, bool allow_file_modifications);

typedef struct DBSstFileWriter DBSstFileWriter;

Expand Down
2 changes: 1 addition & 1 deletion pkg/ccl/storageccl/engineccl/mvcc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ func TestMVCCIncrementalIteratorIntentStraddlesSStables(t *testing.T) {
if err := db2.WriteFile(`ingest`, sstContents); err != nil {
t.Fatal(err)
}
if err := db2.IngestExternalFiles(ctx, []string{`ingest`}, true); err != nil {
if err := db2.IngestExternalFiles(ctx, []string{`ingest`}, true, true); err != nil {
t.Fatal(err)
}
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/storage/batcheval/cmd_add_sstable_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ func runTestDBAddSSTable(ctx context.Context, t *testing.T, db *client.DB, store
// The second time though we had to make a copy of the SST since rocks saw
// existing data (from the first time), and rejected the no-modification
// attempt.
if after := metrics.AddSSTableApplicationCopies.Count(); before >= after {
t.Fatalf("expected sst copies to increase, %d before %d after", before, after)
if after := metrics.AddSSTableApplicationCopies.Count(); before != after {
t.Fatalf("expected sst copies not to increase, %d before %d after", before, after)
}
}
}
Expand Down Expand Up @@ -373,7 +373,7 @@ func TestAddSSTableMVCCStats(t *testing.T) {
if err := e.WriteFile(filename, sstBytes); err != nil {
t.Fatalf("%+v", err)
}
if err := e.IngestExternalFiles(ctx, []string{filename}, true /* modify the sst */); err != nil {
if err := e.IngestExternalFiles(ctx, []string{filename}, true /* skip writing global seqno */, true /* modify the sst */); err != nil {
t.Fatalf("%+v", err)
}

Expand Down
11 changes: 6 additions & 5 deletions pkg/storage/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,11 +298,12 @@ type Engine interface {
// original engine has been stopped.
NewSnapshot() Reader
// IngestExternalFiles atomically links a slice of files into the RocksDB
// log-structured merge-tree. May modify the files (including the underlying
// file in the case of hard-links) if allowFileModifications is passed as
// well. See additional comments in db.cc's IngestExternalFile explaining
// modification behavior.
IngestExternalFiles(ctx context.Context, paths []string, allowFileModifications bool) error
// log-structured merge-tree. skipWritingSeqNo = true may be passed iff this
// rocksdb will never be read by versions prior to 5.16. Otherwise, if it is
// false, ingestion may modify the files (including the underlying file in the
// case of hard-links) when allowFileModifications true. See additional
// comments in db.cc's IngestExternalFile explaining modification behavior.
IngestExternalFiles(ctx context.Context, paths []string, skipWritingSeqNo, allowFileModifications bool) error
// ApproximateDiskBytes returns an approximation of the on-disk size for the given key span.
ApproximateDiskBytes(from, to roachpb.Key) (uint64, error)
// CompactRange ensures that the specified range of key value pairs is
Expand Down
7 changes: 4 additions & 3 deletions pkg/storage/engine/rocksdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -2768,9 +2768,9 @@ func (fr *RocksDBSstFileReader) IngestExternalFile(data []byte) error {
cPathLen := C.size_t(len(cPaths))
defer C.free(unsafe.Pointer(cPaths[0]))

const noMove, modify = false, true
const noMove, writeSeqNo, modify = false, false, true
return statusToError(C.DBIngestExternalFiles(
fr.rocksDB.rdb, &cPaths[0], cPathLen, noMove, modify,
fr.rocksDB.rdb, &cPaths[0], cPathLen, noMove, writeSeqNo, modify,
))
}

Expand Down Expand Up @@ -2912,7 +2912,7 @@ func (r *RocksDB) setAuxiliaryDir(d string) error {
// IngestExternalFiles atomically links a slice of files into the RocksDB
// log-structured merge-tree.
func (r *RocksDB) IngestExternalFiles(
ctx context.Context, paths []string, allowFileModifications bool,
ctx context.Context, paths []string, skipWritingSeqNo, allowFileModifications bool,
) error {
cPaths := make([]*C.char, len(paths))
for i := range paths {
Expand All @@ -2929,6 +2929,7 @@ func (r *RocksDB) IngestExternalFiles(
&cPaths[0],
C.size_t(len(cPaths)),
C._Bool(true), // move_files
C._Bool(!skipWritingSeqNo),
C._Bool(allowFileModifications),
))
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/storage/engine/rocksdb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1360,7 +1360,7 @@ func TestRocksDBDeleteRangeCompaction(t *testing.T) {
t.Fatal(err)
}

if err := db.IngestExternalFiles(context.Background(), []string{filename}, true); err != nil {
if err := db.IngestExternalFiles(context.Background(), []string{filename}, true, true); err != nil {
t.Fatal(err)
}
if testing.Verbose() {
Expand Down Expand Up @@ -1487,7 +1487,7 @@ func BenchmarkRocksDBDeleteRangeIterate(b *testing.B) {
b.Fatal(err)
}

err = db.IngestExternalFiles(context.Background(), []string{filename}, true)
err = db.IngestExternalFiles(context.Background(), []string{filename}, true, true)
if err != nil {
b.Fatal(err)
}
Expand Down Expand Up @@ -1604,7 +1604,7 @@ func TestSstFileWriterTimeBound(t *testing.T) {
if err := db.WriteFile(`ingest`, sstContents); err != nil {
t.Fatal(err)
}
if err := db.IngestExternalFiles(ctx, []string{`ingest`}, true); err != nil {
if err := db.IngestExternalFiles(ctx, []string{`ingest`}, true, true); err != nil {
t.Fatal(err)
}
}
Expand Down
10 changes: 8 additions & 2 deletions pkg/storage/replica_proposal.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,12 @@ func addSSTablePreApply(
log.Fatalf(ctx, "sideloaded SSTable at term %d, index %d is missing", term, index)
}

// as of VersionUnreplicatedRaftTruncatedState we were on rocksdb 5.17 so this
// cluster version should indicate that we will never use rocksdb < 5.16 to
// read these SSTs, so it is safe to use https://github.com/facebook/rocksdb/pull/4172
// to avoid needing the global seq_no edits and the copies they required.
canSkipSeqNo := st.Version.IsActive(cluster.VersionUnreplicatedRaftTruncatedState)

copied := false
if inmem, ok := eng.(engine.InMem); ok {
path = fmt.Sprintf("%x", checksum)
Expand All @@ -416,7 +422,7 @@ func addSSTablePreApply(
// pass it the path in the sideload store as it deletes the passed path on
// success.
if linkErr := eng.LinkFile(path, ingestPath); linkErr == nil {
ingestErr := eng.IngestExternalFiles(ctx, []string{ingestPath}, noModify)
ingestErr := eng.IngestExternalFiles(ctx, []string{ingestPath}, canSkipSeqNo, noModify)
if ingestErr == nil {
// Adding without modification succeeded, no copy necessary.
log.Eventf(ctx, "ingested SSTable at index %d, term %d: %s", index, term, ingestPath)
Expand Down Expand Up @@ -457,7 +463,7 @@ func addSSTablePreApply(
copied = true
}

if err := eng.IngestExternalFiles(ctx, []string{path}, modify); err != nil {
if err := eng.IngestExternalFiles(ctx, []string{path}, canSkipSeqNo, modify); err != nil {
log.Fatalf(ctx, "while ingesting %s: %s", path, err)
}
log.Eventf(ctx, "ingested SSTable at index %d, term %d: %s", index, term, path)
Expand Down

0 comments on commit 4ffed77

Please sign in to comment.