Skip to content

Commit

Permalink
Crash simulator facility without actually SIGKILL (#423)
Browse files Browse the repository at this point in the history
* Crash simulator facility without actually SIGKILL
  • Loading branch information
hkadayam committed Jun 6, 2024
1 parent dedf127 commit 3748a90
Show file tree
Hide file tree
Showing 9 changed files with 130 additions and 20 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.4.11"
version = "6.4.12"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
10 changes: 10 additions & 0 deletions src/include/homestore/homestore.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class ChunkSelector;
class ReplDevListener;
class ReplApplication;

#ifdef _PRERELEASE
class CrashSimulator;
#endif

using HomeStoreSafePtr = std::shared_ptr< HomeStore >;

VENUM(hs_vdev_type_t, uint32_t, DATA_VDEV = 1, INDEX_VDEV = 2, META_VDEV = 3, LOGDEV_VDEV = 4);
Expand Down Expand Up @@ -170,6 +174,12 @@ class HomeStore {
CPManager& cp_mgr() { return *m_cp_mgr.get(); }
shared< sisl::Evictor > evictor() { return m_evictor; }

#ifdef _PRERELEASE
HomeStore& with_crash_simulator(std::function< void(void) > restart_cb);
CrashSimulator& crash_simulator() { return *m_crash_simulator; }
unique< CrashSimulator > m_crash_simulator;
#endif

private:
void init_cache();
shared< VirtualDev > create_vdev_cb(const vdev_info& vinfo, bool load_existing);
Expand Down
43 changes: 43 additions & 0 deletions src/lib/common/crash_simulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#pragma once

#ifdef _PRERELEASE
#include <csignal>
#include <functional>
#include <string>
#include <thread>
#include <utility>
#include <sisl/utility/urcu_helper.hpp>
#include <iomgr/iomgr_flip.hpp>

namespace homestore {

// Pre-release-only helper that lets tests simulate a process crash.
//
// If a restart callback is supplied, crash() records the crashed state and runs the callback on a detached thread
// instead of terminating the process. If no callback is supplied, crash() raises SIGKILL.
class CrashSimulator {
public:
    // cb: callback run by crash() in place of SIGKILL. An empty callback (the default) selects the SIGKILL path.
    CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{std::move(cb)} {}
    ~CrashSimulator() = default;

    // Simulates a crash: either marks the simulator as crashed and launches the restart callback on its own thread,
    // or, when no callback was provided, raises SIGKILL.
    void crash() {
        if (m_restart_cb) {
            m_crashed.update([](auto* s) { *s = true; });

            // We can restart on a new thread to allow other operations to continue.
            // The thread owns a copy of the callback instead of capturing `this`: it is detached, so it may still
            // be running after this object is destroyed.
            // NOTE(review): presumably the restart callback can tear down the owner of this simulator - confirm.
            std::thread t([restart_cb = m_restart_cb]() { restart_cb(); });
            t.detach();
        } else {
            raise(SIGKILL);
        }
    }

    // True only when a restart callback is configured and the crashed flag is set.
    // NOTE(review): the flag is set by crash() and never cleared here; its initial value is whatever
    // sisl::urcu_scoped_ptr< bool > default-constructs (presumably false) - confirm.
    bool is_crashed() const { return ((m_restart_cb != nullptr) && *(m_crashed.access().get())); }

    // Calls crash() if the flip named flip_name tests as set. Returns true if crash() was invoked, false otherwise.
    bool crash_if_flip_set(const std::string& flip_name) {
        if (iomgr_flip::instance()->test_flip(flip_name)) {
            this->crash();
            return true;
        } else {
            return false;
        }
    }

private:
    std::function< void(void) > m_restart_cb{nullptr}; // Empty => crash() raises SIGKILL
    sisl::urcu_scoped_ptr< bool > m_crashed;           // Set to true by crash() on the restart-callback path
};
} // namespace homestore
#endif
23 changes: 12 additions & 11 deletions src/lib/device/journal_vdev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "common/homestore_assert.hpp"
#include "common/homestore_utils.hpp"
#include "common/resource_mgr.hpp"
#include "common/crash_simulator.hpp"

SISL_LOGGING_DECL(journalvdev)

Expand All @@ -45,15 +46,15 @@ JournalVirtualDev::JournalVirtualDev(DeviceManager& dmgr, const vdev_info& vinfo
m_init_private_data = std::make_shared< JournalChunkPrivate >();
m_chunk_pool = std::make_unique< ChunkPool >(
dmgr,
ChunkPool::Params{
HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity),
[this]() {
m_init_private_data->created_at = get_time_since_epoch_ms();
m_init_private_data->end_of_chunk = m_vdev_info.chunk_size;
sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), sizeof(JournalChunkPrivate)};
return private_blob;
},
m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size});
ChunkPool::Params{HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity),
[this]() {
m_init_private_data->created_at = get_time_since_epoch_ms();
m_init_private_data->end_of_chunk = m_vdev_info.chunk_size;
sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()),
sizeof(JournalChunkPrivate)};
return private_blob;
},
m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size});

resource_mgr().register_journal_vdev_exceed_cb([this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) {
// either it is critical or non-critical, call cp_flush;
Expand Down Expand Up @@ -245,14 +246,14 @@ off_t JournalVirtualDev::Descriptor::alloc_next_append_blk(size_t sz) {
LOGDEBUGMOD(journalvdev, "No space left for size {} Creating chunk desc {}", sz, to_string());

#ifdef _PRERELEASE
iomgr_flip::test_and_abort("abort_before_update_eof_cur_chunk");
if (hs()->crash_simulator().crash_if_flip_set("abort_before_update_eof_cur_chunk")) { return tail_offset(); }
#endif

// Append a chunk to m_journal_chunks list. This will increase the m_end_offset.
append_chunk();

#ifdef _PRERELEASE
iomgr_flip::test_and_abort("abort_after_update_eof_next_chunk");
if (hs()->crash_simulator().crash_if_flip_set("abort_after_update_eof_next_chunk")) { return tail_offset(); }
#endif

RELEASE_ASSERT((tail_offset() + static_cast< off_t >(sz)) < m_end_offset, "No space for append blk");
Expand Down
33 changes: 33 additions & 0 deletions src/lib/device/virtual_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include "common/error.h"
#include "common/homestore_assert.hpp"
#include "common/homestore_utils.hpp"
#include "common/crash_simulator.hpp"
#include "blkalloc/varsize_blk_allocator.h"
#include "device/round_robin_chunk_selector.h"
#include "blkalloc/append_blk_allocator.h"
Expand Down Expand Up @@ -326,6 +327,10 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32
bool part_of_batch) {
HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_write needs individual pieces of blkid - not MultiBlkid");

#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
#endif

Chunk* chunk;
uint64_t const dev_offset = to_dev_offset(bid, &chunk);
if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) {
Expand All @@ -344,6 +349,10 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32

folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk,
uint64_t offset_in_chunk) {
#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
#endif

if (sisl_unlikely(!is_chunk_available(chunk))) {
return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again));
}
Expand All @@ -361,6 +370,9 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32
folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, BlkId const& bid,
bool part_of_batch) {
HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_writev needs individual pieces of blkid - not MultiBlkid");
#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
#endif

Chunk* chunk;
uint64_t const dev_offset = to_dev_offset(bid, &chunk);
Expand All @@ -380,6 +392,10 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons

folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk,
uint64_t offset_in_chunk) {
#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); }
#endif

if (sisl_unlikely(!is_chunk_available(chunk))) {
return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again));
}
Expand All @@ -397,6 +413,10 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons

////////////////////////// sync write section //////////////////////////////////
std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId const& bid) {
#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
#endif

HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_write needs individual pieces of blkid - not MultiBlkid");

Chunk* chunk;
Expand All @@ -409,6 +429,10 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con

std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk,
uint64_t offset_in_chunk) {
#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
#endif

if (sisl_unlikely(!is_chunk_available(chunk))) {
return std::make_error_code(std::errc::resource_unavailable_try_again);
}
Expand All @@ -418,6 +442,10 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared<
std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId const& bid) {
HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_writev needs individual pieces of blkid - not MultiBlkid");

#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
#endif

Chunk* chunk;
uint64_t const dev_offset = to_dev_offset(bid, &chunk);
if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) {
Expand All @@ -436,9 +464,14 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons

std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk,
uint64_t offset_in_chunk) {
#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
#endif

if (sisl_unlikely(!is_chunk_available(chunk))) {
return std::make_error_code(std::errc::resource_unavailable_try_again);
}

uint64_t const dev_offset = chunk->start_offset() + offset_in_chunk;
auto const size = get_len(iov, iovcnt);
auto* pdev = chunk->physical_dev_mutable();
Expand Down
16 changes: 14 additions & 2 deletions src/lib/homestore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "common/resource_mgr.hpp"
#include "meta/meta_sb.hpp"
#include "replication/service/generic_repl_svc.h"
#include "common/crash_simulator.hpp"

/*
* IO errors handling by homestore.
Expand Down Expand Up @@ -90,6 +91,13 @@ HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_ap
return *this;
}

#ifdef _PRERELEASE
// Installs a CrashSimulator built from the given callback, replacing any previously installed one.
// Returns *this so the call can be chained with other with_*() setup calls.
HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) {
    auto simulator = std::make_unique< CrashSimulator >(std::move(cb));
    m_crash_simulator = std::move(simulator);
    return *this;
}
#endif

bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb) {
auto& hs_config = HomeStoreStaticConfig::instance();
hs_config.input = input;
Expand Down Expand Up @@ -120,6 +128,12 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_

HomeStoreDynamicConfig::init_settings_default();

#ifdef _PRERELEASE
// Start a default crash simulator, which raises SIGKILL, in case the user has not provided a callback via
// with_crash_simulator()
if (m_crash_simulator == nullptr) { m_crash_simulator = std::make_unique< CrashSimulator >(nullptr); }
#endif

LOGINFO("Homestore is loading with following services: {}", m_services.list());
if (has_meta_service()) { m_meta_service = std::make_unique< MetaBlkService >(); }
if (has_index_service()) { m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs)); }
Expand Down Expand Up @@ -151,7 +165,6 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_
}

void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format_opts) {

std::map< HSDevType, float > total_pct_by_type = {{HSDevType::Fast, 0.0f}, {HSDevType::Data, 0.0f}};
// Accumulate total percentage of services on each device type
for (const auto& [svc_type, fparams] : format_opts) {
Expand Down Expand Up @@ -188,7 +201,6 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format
hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Data}));
}


std::vector< folly::Future< std::error_code > > futs;
for (const auto& [svc_type, fparams] : format_opts) {
if (fparams.size_pct == 0) { continue; }
Expand Down
4 changes: 3 additions & 1 deletion src/lib/logstore/log_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "common/homestore_assert.hpp"
#include "common/homestore_config.hpp"
#include "common/homestore_utils.hpp"
#include "common/crash_simulator.hpp"

namespace homestore {

Expand Down Expand Up @@ -604,7 +605,8 @@ uint64_t LogDev::truncate(const logdev_key& key) {
#ifdef _PRERELEASE
if (garbage_collect && iomgr_flip::instance()->test_flip("logdev_abort_after_garbage")) {
THIS_LOGDEV_LOG(INFO, "logdev aborting after unreserving garbage ids");
raise(SIGKILL);
hs()->crash_simulator().crash();
return num_records_to_truncate;
}
#endif
}
Expand Down
9 changes: 5 additions & 4 deletions src/lib/meta/meta_blk_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <homestore/chunk_selector.h>
#include "common/homestore_flip.hpp"
#include "common/homestore_utils.hpp"
#include "common/crash_simulator.hpp"
#include "device/device.h"
#include "device/virtual_dev.hpp"
#include "device/physical_dev.hpp"
Expand Down Expand Up @@ -732,7 +733,7 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont
mblk->hdr.h.ovf_bid = obid;

#ifdef _PRERELEASE
iomgr_flip::test_and_abort("write_with_ovf_abort");
if (hs()->crash_simulator().crash_if_flip_set("write_with_ovf_abort")) { return; }
#endif
}

Expand All @@ -745,7 +746,7 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont
write_meta_blk_to_disk(mblk);

#ifdef _PRERELEASE
iomgr_flip::test_and_abort("write_sb_abort");
if (hs()->crash_simulator().crash_if_flip_set("write_sb_abort")) { return; }
#endif
}

Expand Down Expand Up @@ -834,7 +835,7 @@ void MetaBlkService::update_sub_sb(const uint8_t* context_data, uint64_t sz, voi
write_meta_blk_internal(mblk, context_data, sz);

#ifdef _PRERELEASE
iomgr_flip::test_and_abort("update_sb_abort");
if (hs()->crash_simulator().crash_if_flip_set("update_sb_abort")) { return; }
#endif

// free the overflow bid if it is there
Expand Down Expand Up @@ -933,7 +934,7 @@ std::error_condition MetaBlkService::remove_sub_sb(void* cookie) {
free_meta_blk(rm_blk);

#ifdef _PRERELEASE
iomgr_flip::test_and_abort("remove_sb_abort");
if (hs()->crash_simulator().crash_if_flip_set("remove_sb_abort")) { return no_error; }
#endif

HS_LOG(DEBUG, metablk, "after remove, mstore used size: {}", m_sb_vdev->used_size());
Expand Down
10 changes: 9 additions & 1 deletion src/tests/test_common/homestore_test_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ class HSTestHelper {
vdev_size_type_t vdev_size_type{vdev_size_type_t::VDEV_SIZE_STATIC};
};

static void start_homestore(const std::string& test_name, std::map< uint32_t, test_params >&& svc_params,
static void start_homestore(const std::string& test_name, std::map< uint32_t, test_params > const& svc_params_tmp,
hs_before_services_starting_cb_t cb = nullptr, bool fake_restart = false,
bool init_device = true, uint32_t shutdown_delay_sec = 5,
std::vector< std::pair< std::string, homestore::HSDevType > > cust_dev_names = {}) {
Expand All @@ -191,6 +191,7 @@ class HSTestHelper {
std::this_thread::sleep_for(std::chrono::seconds{shutdown_delay_sec});
}

std::map< uint32_t, test_params > svc_params = std::move(svc_params_tmp);
std::vector< homestore::dev_info > device_info;
if (!cust_dev_names.empty() || SISL_OPTIONS.count("device_list")) {
if (cust_dev_names.empty())
Expand Down Expand Up @@ -291,6 +292,13 @@ class HSTestHelper {
hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector);
}
}
#ifdef _PRERELEASE
hsi->with_crash_simulator([=]() {
LOGINFO("CrashSimulator::crash() is called - restarting homestore");
start_homestore(test_name, svc_params, cb, true /* fake_restart */, false /* init_device */);
});
#endif

bool need_format =
hsi->start(hs_input_params{.devices = device_info, .app_mem_size = app_mem_size}, std::move(cb));

Expand Down

0 comments on commit 3748a90

Please sign in to comment.