eBay · hkadayam · Jun 6, 2024 · May 15, 2024 · May 15, 2024 · Jun 6, 2024
diff --git a/conanfile.py b/conanfile.py
@@ -5,7 +5,7 @@
 
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.4.6"
+    version = "6.4.7"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp
@@ -54,6 +54,10 @@ class ChunkSelector;
 class ReplDevListener;
 class ReplApplication;
 
+#ifdef _PRERELEASE
+class CrashSimulator;
+#endif
+
 using HomeStoreSafePtr = std::shared_ptr< HomeStore >;
 
 VENUM(hs_vdev_type_t, uint32_t, DATA_VDEV = 1, INDEX_VDEV = 2, META_VDEV = 3, LOGDEV_VDEV = 4);
@@ -170,6 +174,12 @@ class HomeStore {
     CPManager& cp_mgr() { return *m_cp_mgr.get(); }
     shared< sisl::Evictor > evictor() { return m_evictor; }
 
+#ifdef _PRERELEASE
+    HomeStore& with_crash_simulator(std::function< void(void) > restart_cb);
+    CrashSimulator& crash_simulator() { return *m_crash_simulator; }
+    unique< CrashSimulator > m_crash_simulator;
+#endif
+
 private:
     void init_cache();
     shared< VirtualDev > create_vdev_cb(const vdev_info& vinfo, bool load_existing);

diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp
@@ -0,0 +1,59 @@
+#pragma once
+
+#ifdef _PRERELEASE
+#include <csignal>
+#include <functional>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include <sisl/utility/urcu_helper.hpp>
+#include <iomgr/iomgr_flip.hpp>
+
+namespace homestore {
+
+/// Crash-injection helper, compiled only when _PRERELEASE is defined.
+///
+/// If a restart callback is supplied, crash() marks the simulator as crashed and runs the callback on a
+/// detached thread; otherwise crash() raises SIGKILL.
+class CrashSimulator {
+public:
+    /// @param cb Callback run by crash() on a separate thread; when empty, crash() raises SIGKILL instead.
+    CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{std::move(cb)} {}
+    ~CrashSimulator() = default;
+
+    /// Simulates a crash: sets the crashed flag and launches the restart callback, or raises SIGKILL if unset.
+    void crash() {
+        if (m_restart_cb) {
+            m_crashed.update([](auto* s) { *s = true; });
+
+            // Restart on a new thread to allow other operations to continue. The thread gets its own copy of the
+            // callback rather than `this`, so it never touches this object after crash() returns, even if the
+            // restart ends up destroying the simulator.
+            std::thread t([cb = m_restart_cb]() { cb(); });
+            t.detach();
+        } else {
+            raise(SIGKILL);
+        }
+    }
+
+    /// @return true only when a restart callback is installed and crash() has set the crashed flag.
+    bool is_crashed() const { return ((m_restart_cb != nullptr) && *(m_crashed.access().get())); }
+
+    /// Calls crash() if the named flip is set.
+    /// @param flip_name Name of the flip passed to iomgr_flip's test_flip().
+    /// @return true if the flip was set and crash() was invoked, false otherwise.
+    bool crash_if_flip_set(const std::string& flip_name) {
+        if (iomgr_flip::instance()->test_flip(flip_name)) {
+            this->crash();
+            return true;
+        }
+        return false;
+    }
+
+private:
+    std::function< void(void) > m_restart_cb{nullptr}; // empty => crash() raises SIGKILL
+    sisl::urcu_scoped_ptr< bool > m_crashed;           // set to true by crash() before the restart thread starts
+};
+} // namespace homestore
+#endif
diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp
@@ -34,6 +34,7 @@
 #include "common/homestore_assert.hpp"
 #include "common/homestore_utils.hpp"
 #include "common/resource_mgr.hpp"
+#include "common/crash_simulator.hpp"
 
 SISL_LOGGING_DECL(journalvdev)
 
@@ -45,15 +46,15 @@ JournalVirtualDev::JournalVirtualDev(DeviceManager& dmgr, const vdev_info& vinfo
     m_init_private_data = std::make_shared< JournalChunkPrivate >();
     m_chunk_pool = std::make_unique< ChunkPool >(
         dmgr,
-        ChunkPool::Params{
-            HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity),
-            [this]() {
-                m_init_private_data->created_at = get_time_since_epoch_ms();
-                m_init_private_data->end_of_chunk = m_vdev_info.chunk_size;
-                sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), sizeof(JournalChunkPrivate)};
-                return private_blob;
-            },
-            m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size});
+        ChunkPool::Params{HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity),
+                          [this]() {
+                              m_init_private_data->created_at = get_time_since_epoch_ms();
+                              m_init_private_data->end_of_chunk = m_vdev_info.chunk_size;
+                              sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()),
+                                                      sizeof(JournalChunkPrivate)};
+                              return private_blob;
+                          },
+                          m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size});
 
     resource_mgr().register_journal_vdev_exceed_cb([this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) {
         // either it is critical or non-critical, call cp_flush;
@@ -245,14 +246,14 @@ off_t JournalVirtualDev::Descriptor::alloc_next_append_blk(size_t sz) {
         LOGDEBUGMOD(journalvdev, "No space left for size {} Creating chunk desc {}", sz, to_string());
 
 #ifdef _PRERELEASE
-        iomgr_flip::test_and_abort("abort_before_update_eof_cur_chunk");
+        if (hs()->crash_simulator().crash_if_flip_set("abort_before_update_eof_cur_chunk")) { return tail_offset(); }
 #endif
 
         // Append a chunk to m_journal_chunks list. This will increase the m_end_offset.
         append_chunk();
 
 #ifdef _PRERELEASE
-        iomgr_flip::test_and_abort("abort_after_update_eof_next_chunk");
+        if (hs()->crash_simulator().crash_if_flip_set("abort_after_update_eof_next_chunk")) { return tail_offset(); }
 #endif
 
         RELEASE_ASSERT((tail_offset() + static_cast< off_t >(sz)) < m_end_offset, "No space for append blk");

diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp
@@ -42,6 +42,7 @@
 #include "common/error.h"
 #include "common/homestore_assert.hpp"
 #include "common/homestore_utils.hpp"
+#include "common/crash_simulator.hpp"
 #include "blkalloc/varsize_blk_allocator.h"
 #include "device/round_robin_chunk_selector.h"
 #include "blkalloc/append_blk_allocator.h"
@@ -316,6 +317,10 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32
                                                          bool part_of_batch) {
     HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_write needs individual pieces of blkid - not MultiBlkid");
 
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
+#endif
+
     Chunk* chunk;
     uint64_t const dev_offset = to_dev_offset(bid, &chunk);
     if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) {
@@ -334,6 +339,10 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32
 
 folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk,
                                                          uint64_t offset_in_chunk) {
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
+#endif
+
     if (sisl_unlikely(!is_chunk_available(chunk))) {
         return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again));
     }
@@ -351,6 +360,9 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32
 folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, BlkId const& bid,
                                                           bool part_of_batch) {
     HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_writev needs individual pieces of blkid - not MultiBlkid");
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
+#endif
 
     Chunk* chunk;
     uint64_t const dev_offset = to_dev_offset(bid, &chunk);
@@ -370,6 +382,10 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons
 
 folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk,
                                                           uint64_t offset_in_chunk) {
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
+#endif
+
     if (sisl_unlikely(!is_chunk_available(chunk))) {
         return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again));
     }
@@ -387,6 +403,10 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons
 
 ////////////////////////// sync write section //////////////////////////////////
 std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId const& bid) {
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
+#endif
+
     HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_write needs individual pieces of blkid - not MultiBlkid");
 
     Chunk* chunk;
@@ -399,6 +419,10 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con
 
 std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk,
                                        uint64_t offset_in_chunk) {
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
+#endif
+
     if (sisl_unlikely(!is_chunk_available(chunk))) {
         return std::make_error_code(std::errc::resource_unavailable_try_again);
     }
@@ -408,6 +432,10 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared<
 std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId const& bid) {
     HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_writev needs individual pieces of blkid - not MultiBlkid");
 
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
+#endif
+
     Chunk* chunk;
     uint64_t const dev_offset = to_dev_offset(bid, &chunk);
     if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) {
@@ -426,9 +454,14 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons
 
 std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk,
                                         uint64_t offset_in_chunk) {
+#ifdef _PRERELEASE
+    if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
+#endif
+
     if (sisl_unlikely(!is_chunk_available(chunk))) {
         return std::make_error_code(std::errc::resource_unavailable_try_again);
     }
+
     uint64_t const dev_offset = chunk->start_offset() + offset_in_chunk;
     auto const size = get_len(iov, iovcnt);
     auto* pdev = chunk->physical_dev_mutable();

diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp
@@ -41,6 +41,7 @@
 #include "common/resource_mgr.hpp"
 #include "meta/meta_sb.hpp"
 #include "replication/service/generic_repl_svc.h"
+#include "common/crash_simulator.hpp"
 
 /*
  * IO errors handling by homestore.
@@ -90,6 +91,13 @@ HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_ap
     return *this;
 }
 
+#ifdef _PRERELEASE
+HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) { // cb: forwarded to CrashSimulator
+    m_crash_simulator = std::make_unique< CrashSimulator >(std::move(cb)); // replaces any simulator already held
+    return *this; // returns this HomeStore so further calls can be chained on the result
+}
+#endif
+
 bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb) {
     auto& hs_config = HomeStoreStaticConfig::instance();
     hs_config.input = input;
@@ -120,6 +128,12 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_
 
     HomeStoreDynamicConfig::init_settings_default();
 
+#ifdef _PRERELEASE
+    // Start a default crash simulator which raises SIGKILL, in case user has not provided with_crash_simulator()
+    // callback
+    if (m_crash_simulator == nullptr) { m_crash_simulator = std::make_unique< CrashSimulator >(nullptr); }
+#endif
+
     LOGINFO("Homestore is loading with following services: {}", m_services.list());
     if (has_meta_service()) { m_meta_service = std::make_unique< MetaBlkService >(); }
     if (has_index_service()) { m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs)); }
@@ -151,7 +165,6 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_
 }
 
 void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format_opts) {
-
     std::map< HSDevType, float > total_pct_by_type = {{HSDevType::Fast, 0.0f}, {HSDevType::Data, 0.0f}};
     // Accumulate total percentage of services on each device type
     for (const auto& [svc_type, fparams] : format_opts) {
@@ -188,7 +201,6 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format
         hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Data}));
     }
 
-
     std::vector< folly::Future< std::error_code > > futs;
     for (const auto& [svc_type, fparams] : format_opts) {
         if (fparams.size_pct == 0) { continue; }

diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp
@@ -30,6 +30,7 @@
 #include "common/homestore_assert.hpp"
 #include "common/homestore_config.hpp"
 #include "common/homestore_utils.hpp"
+#include "common/crash_simulator.hpp"
 
 namespace homestore {
 
@@ -604,7 +605,8 @@ uint64_t LogDev::truncate(const logdev_key& key) {
 #ifdef _PRERELEASE
             if (garbage_collect && iomgr_flip::instance()->test_flip("logdev_abort_after_garbage")) {
                 THIS_LOGDEV_LOG(INFO, "logdev aborting after unreserving garbage ids");
-                raise(SIGKILL);
+                hs()->crash_simulator().crash();
+                return num_records_to_truncate;
             }
 #endif
         }

diff --git a/src/lib/meta/meta_blk_service.cpp b/src/lib/meta/meta_blk_service.cpp
@@ -32,6 +32,7 @@
 #include <homestore/chunk_selector.h>
 #include "common/homestore_flip.hpp"
 #include "common/homestore_utils.hpp"
+#include "common/crash_simulator.hpp"
 #include "device/device.h"
 #include "device/virtual_dev.hpp"
 #include "device/physical_dev.hpp"
@@ -705,7 +706,7 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont
         mblk->hdr.h.ovf_bid = obid;
 
 #ifdef _PRERELEASE
-        iomgr_flip::test_and_abort("write_with_ovf_abort");
+        if (hs()->crash_simulator().crash_if_flip_set("write_with_ovf_abort")) { return; }
 #endif
     }
 
@@ -718,7 +719,7 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont
     write_meta_blk_to_disk(mblk);
 
 #ifdef _PRERELEASE
-    iomgr_flip::test_and_abort("write_sb_abort");
+    if (hs()->crash_simulator().crash_if_flip_set("write_sb_abort")) { return; }
 #endif
 }
 
@@ -807,7 +808,7 @@ void MetaBlkService::update_sub_sb(const uint8_t* context_data, uint64_t sz, voi
     write_meta_blk_internal(mblk, context_data, sz);
 
 #ifdef _PRERELEASE
-    iomgr_flip::test_and_abort("update_sb_abort");
+    if (hs()->crash_simulator().crash_if_flip_set("update_sb_abort")) { return; }
 #endif
 
     // free the overflow bid if it is there
@@ -906,7 +907,7 @@ std::error_condition MetaBlkService::remove_sub_sb(void* cookie) {
     free_meta_blk(rm_blk);
 
 #ifdef _PRERELEASE
-    iomgr_flip::test_and_abort("remove_sb_abort");
+    if (hs()->crash_simulator().crash_if_flip_set("remove_sb_abort")) { return no_error; }
 #endif
 
     HS_LOG(DEBUG, metablk, "after remove, mstore used size: {}", m_sb_vdev->used_size());

diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp
@@ -175,7 +175,7 @@ class HSTestHelper {
         vdev_size_type_t vdev_size_type{vdev_size_type_t::VDEV_SIZE_STATIC};
     };
 
-    static void start_homestore(const std::string& test_name, std::map< uint32_t, test_params >&& svc_params,
+    static void start_homestore(const std::string& test_name, std::map< uint32_t, test_params > const& svc_params_tmp,
                                 hs_before_services_starting_cb_t cb = nullptr, bool fake_restart = false,
                                 bool init_device = true, uint32_t shutdown_delay_sec = 5) {
         auto const ndevices = SISL_OPTIONS["num_devs"].as< uint32_t >();
@@ -189,6 +189,7 @@ class HSTestHelper {
             std::this_thread::sleep_for(std::chrono::seconds{shutdown_delay_sec});
         }
 
+        std::map< uint32_t, test_params > svc_params = std::move(svc_params_tmp);
         std::vector< homestore::dev_info > device_info;
         if (SISL_OPTIONS.count("device_list")) {
             s_dev_names = SISL_OPTIONS["device_list"].as< std::vector< std::string > >();
@@ -244,6 +245,11 @@ class HSTestHelper {
                 hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector);
             }
         }
+        hsi->with_crash_simulator([=]() {
+            LOGINFO("CrashSimulator::crash() is called - restarting homestore");
+            start_homestore(test_name, svc_params, cb, true /* fake_restart */, false /* init_device */);
+        });
+
         bool need_format =
             hsi->start(hs_input_params{.devices = device_info, .app_mem_size = app_mem_size}, std::move(cb));