Skip to content

Commit

Permalink
Skip directory fsync for filesystem btrfs (#8903)
Browse files Browse the repository at this point in the history
Summary:
Directory fsync might be expensive on btrfs and it may not be needed.
Here are 4 directory fsync cases:
1. creating a new file: dir-fsync is not needed on btrfs, as long as the
   new file itself is synced.
2. renaming a file: dir-fsync is not needed if the renamed file is
   synced. So an API `FsyncAfterFileRename(filename, ...)` is provided
   to sync the file on btrfs. By default, it just calls dir-fsync.
3. deleting files: dir-fsync is forced by set
   `IOOptions.force_dir_fsync = true`
4. renaming multiple files (like backup and checkpoint): dir-fsync is
   forced, the same as above.

Pull Request resolved: #8903

Test Plan: run tests on btrfs and non btrfs

Reviewed By: ajkr

Differential Revision: D30885059

Pulled By: jay-zhuang

fbshipit-source-id: dd2730b31580b0bcaedffc318a762d7dbf25de4a
  • Loading branch information
jay-zhuang authored and facebook-github-bot committed Nov 3, 2021
1 parent 0817227 commit 2910264
Show file tree
Hide file tree
Showing 29 changed files with 440 additions and 54 deletions.
2 changes: 2 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
* Made FileSystem extend the Customizable class and added a CreateFromString method. Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method.
* Clarified in API comments that RocksDB is not exception safe for callbacks and custom extensions. An exception propagating into RocksDB can lead to undefined behavior, including data loss, unreported corruption, deadlocks, and more.
* Marked `WriteBufferManager` as `final` because it is not intended for extension.
* Add API `FSDirectory::FsyncWithDirOptions()`, which provides extra information like directory fsync reason in `DirFsyncOptions`. File system like btrfs is using that to skip directory fsync for creating a new file, or when renaming a file, fsync the target file instead of the directory, which improves the `DB::Open()` speed by ~20%.
* `DB::Open()` is not going be blocked by obsolete file purge if `DBOptions::avoid_unnecessary_blocking_io` is set to true.

## 6.26.0 (2021-10-20)
### Bug Fixes
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1926,6 +1926,9 @@ clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST
ribbon_bench: $(OBJ_DIR)/microbench/ribbon_bench.o $(LIBRARY)
$(AM_LINK)

db_basic_bench: $(OBJ_DIR)/microbench/db_basic_bench.o $(LIBRARY)
$(AM_LINK)

cache_reservation_manager_test: $(OBJ_DIR)/cache/cache_reservation_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
#-------------------------------------------------
Expand Down
11 changes: 8 additions & 3 deletions db/compaction/compaction_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -763,12 +763,16 @@ Status CompactionJob::Run() {
constexpr IODebugContext* dbg = nullptr;

if (output_directory_) {
io_s = output_directory_->Fsync(IOOptions(), dbg);
io_s = output_directory_->FsyncWithDirOptions(
IOOptions(), dbg,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}

if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
blob_output_directory_ != output_directory_) {
io_s = blob_output_directory_->Fsync(IOOptions(), dbg);
io_s = blob_output_directory_->FsyncWithDirOptions(
IOOptions(), dbg,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}
}
if (io_status_.ok()) {
Expand Down Expand Up @@ -2442,7 +2446,8 @@ Status CompactionServiceCompactionJob::Run() {
constexpr IODebugContext* dbg = nullptr;

if (output_directory_) {
io_s = output_directory_->Fsync(IOOptions(), dbg);
io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
DirFsyncOptions());
}
}
if (io_status_.ok()) {
Expand Down
12 changes: 11 additions & 1 deletion db/db_impl/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1389,7 +1389,9 @@ Status DBImpl::SyncWAL() {
IOStatusCheck(io_s);
}
if (status.ok() && need_log_dir_sync) {
status = directories_.GetWalDir()->Fsync(IOOptions(), nullptr);
status = directories_.GetWalDir()->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}
TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");

Expand Down Expand Up @@ -4288,6 +4290,14 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
if (s.ok()) {
// Retry if the file name happen to conflict with an existing one.
s = GetEnv()->RenameFile(file_name, options_file_name);
std::unique_ptr<FSDirectory> dir_obj;
if (s.ok()) {
s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr);
}
if (s.ok()) {
s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
DirFsyncOptions(options_file_name));
}
}
if (s.ok()) {
InstrumentedMutexLock l(&mutex_);
Expand Down
3 changes: 3 additions & 0 deletions db/db_impl/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,9 @@ class DBImpl : public DB {
// is only for the special test of CancelledCompactions
Status TEST_WaitForCompact(bool waitUnscheduled = false);

// Wait for any background purge
Status TEST_WaitForPurge();

// Get the background error status
Status TEST_GetBGError();

Expand Down
8 changes: 6 additions & 2 deletions db/db_impl/db_impl_compaction_flush.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
}
}
if (io_s.ok()) {
io_s = directories_.GetWalDir()->Fsync(IOOptions(), nullptr);
io_s = directories_.GetWalDir()->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}

mutex_.Lock();
Expand Down Expand Up @@ -532,7 +534,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
// Sync on all distinct output directories.
for (auto dir : distinct_output_dirs) {
if (dir != nullptr) {
Status error_status = dir->Fsync(IOOptions(), nullptr);
Status error_status = dir->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
if (!error_status.ok()) {
s = error_status;
break;
Expand Down
8 changes: 8 additions & 0 deletions db/db_impl/db_impl_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,14 @@ Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
return error_handler_.GetBGError();
}

Status DBImpl::TEST_WaitForPurge() {
InstrumentedMutexLock l(&mutex_);
while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
bg_cv_.Wait();
}
return error_handler_.GetBGError();
}

Status DBImpl::TEST_GetBGError() {
InstrumentedMutexLock l(&mutex_);
return error_handler_.GetBGError();
Expand Down
8 changes: 7 additions & 1 deletion db/db_impl/db_impl_files.cc
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,13 @@ void DBImpl::DeleteObsoleteFiles() {

mutex_.Unlock();
if (job_context.HaveSomethingToDelete()) {
PurgeObsoleteFiles(job_context);
bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
PurgeObsoleteFiles(job_context, defer_purge);
if (defer_purge) {
mutex_.Lock();
SchedulePurge();
mutex_.Unlock();
}
}
job_context.Clean();
mutex_.Lock();
Expand Down
6 changes: 2 additions & 4 deletions db/db_impl/db_impl_open.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1694,10 +1694,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
if (impl->two_write_queues_) {
impl->log_write_mutex_.Unlock();
}

impl->DeleteObsoleteFiles();
s = impl->directories_.GetDbDir()->Fsync(IOOptions(), nullptr);
TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFilesAndSyncDir");
}
if (s.ok()) {
// In WritePrepared there could be gap in sequence numbers. This breaks
Expand Down Expand Up @@ -1770,6 +1766,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,

*dbptr = impl;
impl->opened_successfully_ = true;
impl->DeleteObsoleteFiles();
TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles");
impl->MaybeScheduleFlushOrCompaction();
} else {
persist_options_status.PermitUncheckedError();
Expand Down
4 changes: 3 additions & 1 deletion db/db_impl/db_impl_write.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1142,7 +1142,9 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
// We only sync WAL directory the first time WAL syncing is
// requested, so that in case users never turn on WAL sync,
// we can avoid the disk I/O in the write code path.
io_s = directories_.GetWalDir()->Fsync(IOOptions(), nullptr);
io_s = directories_.GetWalDir()->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}
}

Expand Down
2 changes: 1 addition & 1 deletion db/db_secondary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,7 @@ TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
SyncPoint::GetInstance()->LoadDependency(
{{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
"VersionSet::ProcessManifestWrites:BeforeNewManifest"},
{"DBImpl::Open:AfterDeleteFilesAndSyncDir",
{"DBImpl::Open:AfterDeleteFiles",
"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
"1"}});
SyncPoint::GetInstance()->EnableProcessing();
Expand Down
55 changes: 49 additions & 6 deletions db/deletefile_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,15 @@ class DeleteFileTest : public DBTestBase {
manifest_cnt += (type == kDescriptorFile);
}
}
ASSERT_EQ(required_log, log_cnt);
ASSERT_EQ(required_sst, sst_cnt);
ASSERT_EQ(required_manifest, manifest_cnt);
if (required_log >= 0) {
ASSERT_EQ(required_log, log_cnt);
}
if (required_sst >= 0) {
ASSERT_EQ(required_sst, sst_cnt);
}
if (required_manifest >= 0) {
ASSERT_EQ(required_manifest, manifest_cnt);
}
}

static void DoSleep(void* arg) {
Expand Down Expand Up @@ -264,6 +270,41 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
CheckFileTypeCounts(dbname_, 0, 1, 1);
}

TEST_F(DeleteFileTest, PurgeDuringOpen) {
Options options = CurrentOptions();
CheckFileTypeCounts(dbname_, -1, 0, -1);
Close();
std::unique_ptr<WritableFile> file;
ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
EnvOptions()));
ASSERT_OK(file->Close());
CheckFileTypeCounts(dbname_, -1, 1, -1);
options.avoid_unnecessary_blocking_io = false;
options.create_if_missing = false;
Reopen(options);
CheckFileTypeCounts(dbname_, -1, 0, -1);
Close();

// test background purge
options.avoid_unnecessary_blocking_io = true;
options.create_if_missing = false;
ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
EnvOptions()));
ASSERT_OK(file->Close());
CheckFileTypeCounts(dbname_, -1, 1, -1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->LoadDependency(
{{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}});
SyncPoint::GetInstance()->EnableProcessing();
Reopen(options);
// the obsolete file is not deleted until the background purge job is ran
CheckFileTypeCounts(dbname_, -1, 1, -1);
TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1");
ASSERT_OK(dbfull()->TEST_WaitForPurge());
CheckFileTypeCounts(dbname_, -1, 0, -1);
}

TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
Options options = CurrentOptions();
SetOptions(&options);
Expand Down Expand Up @@ -310,16 +351,18 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
do_test(false);
}

options.avoid_unnecessary_blocking_io = true;
options.create_if_missing = false;
Reopen(options);
ASSERT_OK(dbfull()->TEST_WaitForPurge());

SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->LoadDependency(
{{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
"DBImpl::BGWorkPurge:start"}});
SyncPoint::GetInstance()->EnableProcessing();

options.avoid_unnecessary_blocking_io = true;
options.create_if_missing = false;
Reopen(options);
{
SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
do_test(true);
Expand Down
4 changes: 3 additions & 1 deletion db/external_sst_file_ingestion_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ Status ExternalSstFileIngestionJob::Prepare(
TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
if (status.ok()) {
for (auto path_id : ingestion_path_ids) {
status = directories_->GetDataDir(path_id)->Fsync(IOOptions(), nullptr);
status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
if (!status.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"Failed to sync directory %" ROCKSDB_PRIszt
Expand Down
4 changes: 3 additions & 1 deletion db/flush_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,9 @@ Status FlushJob::WriteLevel0Table() {
meta_.marked_for_compaction ? " (needs compaction)" : "");

if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
s = output_file_directory_->Fsync(IOOptions(), nullptr);
s = output_file_directory_->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
db_mutex_->Lock();
Expand Down
2 changes: 1 addition & 1 deletion env/composite_env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ class CompositeDirectoryWrapper : public Directory {
Status Fsync() override {
IOOptions io_opts;
IODebugContext dbg;
return target_->Fsync(io_opts, &dbg);
return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions());
}
size_t GetUniqueId(char* id, size_t max_size) const override {
return target_->GetUniqueId(id, max_size);
Expand Down
12 changes: 12 additions & 0 deletions env/file_system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -241,4 +241,16 @@ std::string FileSystemWrapper::SerializeOptions(
}
}
#endif // ROCKSDB_LITE

DirFsyncOptions::DirFsyncOptions() { reason = kDefault; }

DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) {
reason = kFileRenamed;
renamed_new_name = file_renamed_new_name;
}

DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) {
assert(fsync_reason != kFileRenamed);
reason = fsync_reason;
}
} // namespace ROCKSDB_NAMESPACE
52 changes: 48 additions & 4 deletions env/io_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1534,17 +1534,61 @@ PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
/*
* PosixDirectory
*/
#if !defined(BTRFS_SUPER_MAGIC)
// The magic number for BTRFS is fixed, if it's not defined, define it here
#define BTRFS_SUPER_MAGIC 0x9123683E
#endif
PosixDirectory::PosixDirectory(int fd) : fd_(fd) {
is_btrfs_ = false;
#ifdef OS_LINUX
struct statfs buf;
int ret = fstatfs(fd, &buf);
is_btrfs_ = (ret == 0 && buf.f_type == BTRFS_SUPER_MAGIC);
#endif
}

PosixDirectory::~PosixDirectory() { close(fd_); }

IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) {
IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
return FsyncWithDirOptions(opts, dbg, DirFsyncOptions());
}

IOStatus PosixDirectory::FsyncWithDirOptions(
const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
const DirFsyncOptions& dir_fsync_options) {
IOStatus s = IOStatus::OK();
#ifndef OS_AIX
if (is_btrfs_) {
// skip dir fsync for new file creation, which is not needed for btrfs
if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
return s;
}
// skip dir fsync for renaming file, only need to sync new file
if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
std::string new_name = dir_fsync_options.renamed_new_name;
assert(!new_name.empty());
int fd;
do {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(new_name.c_str(), O_RDONLY);
} while (fd < 0 && errno == EINTR);
if (fd < 0) {
s = IOError("While open renaming file", new_name, errno);
} else if (fsync(fd) < 0) {
s = IOError("While fsync renaming file", new_name, errno);
}
if (close(fd) < 0) {
s = IOError("While closing file after fsync", new_name, errno);
}
return s;
}
// fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
}
if (fsync(fd_) == -1) {
return IOError("While fsync", "a directory", errno);
s = IOError("While fsync", "a directory", errno);
}
#endif
return IOStatus::OK();
return s;
}
} // namespace ROCKSDB_NAMESPACE
#endif
7 changes: 6 additions & 1 deletion env/io_posix.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,12 +391,17 @@ struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer {

class PosixDirectory : public FSDirectory {
public:
explicit PosixDirectory(int fd) : fd_(fd) {}
explicit PosixDirectory(int fd);
~PosixDirectory();
virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;

virtual IOStatus FsyncWithDirOptions(
const IOOptions&, IODebugContext*,
const DirFsyncOptions& dir_fsync_options) override;

private:
int fd_;
bool is_btrfs_;
};

} // namespace ROCKSDB_NAMESPACE
Loading

0 comments on commit 2910264

Please sign in to comment.