Skip to content

Commit

Permalink
Merge branch 'trinity'
Browse files Browse the repository at this point in the history
* trinity:
  MB-58691: Overwrite passed cluster with result from testset
  MB-58691: Move atexit handler unregistration to cluster teardown
  MB-58691: Move cluster_test output interception to testlib config
  MB-61702: Add failover start test condition
  MB-61702: Handle majority of testconditions generically

Change-Id: I0d4ddc42000a03b4962acc5375e9c0d7f58b0406
  • Loading branch information
BenHuddleston committed May 8, 2024
2 parents e8b9a91 + 9e3aee6 commit f24bd7c
Show file tree
Hide file tree
Showing 14 changed files with 149 additions and 145 deletions.
17 changes: 4 additions & 13 deletions apps/ns_server/src/failover.erl
Expand Up @@ -172,6 +172,8 @@ orchestrate(Nodes, Options) when Nodes =/= [] ->
ale:info(?USER_LOGGER, "Starting failing over ~p", [Nodes]),
master_activity_events:note_failover(Nodes),

ok = testconditions:check_test_condition(failover_start),

Res =
case config_sync_and_orchestrate(Nodes, Options) of
{done, ErrorNodes, UnsafeNodes} ->
Expand Down Expand Up @@ -453,8 +455,8 @@ handle_buckets_failover(Nodes, PrepResults) ->

lists:map(
fun ({Bucket, _}) ->
ok = check_test_condition(
{fail_finalize_failover_at_bucket, Bucket})
ok = testconditions:check_test_condition(
{fail_finalize_failover_at_bucket, Bucket})
end, PrepResults),

Results.
Expand Down Expand Up @@ -1021,17 +1023,6 @@ get_failover_vbuckets(Snapshot, Node) ->
chronicle_compat:get(Snapshot, {node, Node, failover_vbuckets},
#{default => []}).

%% Look up the test condition stored under the {Step, Bucket} key.
%% When the stored value is 'fail', the event is logged, the condition is
%% deleted from the store and fail_by_test_condition is returned. Any other
%% stored value results in ok.
check_test_condition({Step, Bucket} = Key) ->
    Stored = testconditions:get(Key),
    if
        Stored =:= fail ->
            ?log_debug("Failing at step: ~p, Bucket: ~p due to test condition",
                       [Step, Bucket]),
            testconditions:delete(Key),
            fail_by_test_condition;
        true ->
            ok
    end.

-ifdef(TEST).

fix_vbucket_map_test_wrapper(Funs) ->
Expand Down
12 changes: 6 additions & 6 deletions apps/ns_server/src/hibernation_manager.erl
Expand Up @@ -106,7 +106,7 @@ pause_bucket(#bucket_hibernation_op_args{

{ok, _BucketConfig} = ns_bucket:remove_bucket(Bucket),

ok = hibernation_utils:check_test_condition(pause_after_node_ops_run),
ok = testconditions:check_test_condition(pause_after_node_ops_run),
kv_hibernation_agent:unprepare_pause_bucket(Bucket, KvNodes).

-spec pause_bucket_body(For, Args, Snapshot, Nodes) -> ok
Expand Down Expand Up @@ -186,7 +186,7 @@ resume_bucket(#bucket_hibernation_op_args{
%% status
ok = restore_bucket_in_resuming(Bucket, NewBucketConfig, Metadata),

ok = hibernation_utils:check_test_condition(resume_before_node_ops_run),
ok = testconditions:check_test_condition(resume_before_node_ops_run),

WorkersParams = build_workers_params(RemotePath, DesiredServers, Snapshot),
ok = hibernation_utils:run_hibernation_op(
Expand All @@ -198,7 +198,7 @@ resume_bucket(#bucket_hibernation_op_args{
ServerMapping, false, Nodes)
end, WorkersParams, ?RESUME_BUCKET_TIMEOUT),

ok = hibernation_utils:check_test_condition(resume_after_node_ops_run),
ok = testconditions:check_test_condition(resume_after_node_ops_run),

%% At this point the bucket will go live with the appropriate map and server
%% list
Expand All @@ -225,7 +225,7 @@ resume_bucket_body(For, Args, ServerMapping, DryRun, Nodes) ->

meck_base_modules() ->
[ns_cluster_membership, bucket_placer, hibernation_utils, ns_config,
ns_bucket, kv_hibernation_agent, service_manager].
ns_bucket, kv_hibernation_agent, service_manager, testconditions].

meck_expect_base() ->
meck:new(meck_base_modules(), [passthrough]),
Expand All @@ -252,7 +252,7 @@ meck_expect_base() ->
{version, ?VERSION_76}, {bucket_manifest, []},
{bucket_uuid, 1}]
end),
meck:expect(hibernation_utils, check_test_condition,
meck:expect(testconditions, check_test_condition,
fun (_) ->
ok
end),
Expand Down Expand Up @@ -486,7 +486,7 @@ force_unpause_via_process_failure_body(ProcessType) ->
Self ! unpause_issued,
ok
end),
meck:expect(hibernation_utils, check_test_condition,
meck:expect(testconditions, check_test_condition,
fun (_) ->
ok
end),
Expand Down
17 changes: 0 additions & 17 deletions apps/ns_server/src/hibernation_utils.erl
Expand Up @@ -32,7 +32,6 @@
get_data_remote_path/1,
get_node_data_remote_path/2,
get_bucket_data_remote_path/3,
check_test_condition/1,
log_hibernation_event/3]).

supported_services() ->
Expand Down Expand Up @@ -467,22 +466,6 @@ get_version_from_s3(Args) ->
end),
Version.

%% Apply the test condition (if any) stored for Step.
%%   undefined step  -> ok, the store is not consulted.
%%   fail            -> logged, deleted, returns fail_by_test_condition.
%%   {delay, Sleep}  -> logged, deleted, then sleeps via timer:sleep(Sleep).
%%   anything else   -> ok.
check_test_condition(undefined) ->
    ok;
check_test_condition(Step) ->
    act_on_test_condition(Step, testconditions:get(Step)).

%% Dispatch on the value read from the test-condition store.
act_on_test_condition(Step, fail) ->
    ?log_debug("Failing at step: ~p due to test condition", [Step]),
    testconditions:delete(Step),
    fail_by_test_condition;
act_on_test_condition(Step, {delay, Sleep}) ->
    ?log_debug("Delaying step ~p by ~p ms", [Step, Sleep]),
    testconditions:delete(Step),
    timer:sleep(Sleep);
act_on_test_condition(_Step, _Other) ->
    ok.

get_event_id(initiated, pause_bucket) ->
pause_bucket_initiated;
get_event_id(initiated, resume_bucket) ->
Expand Down
4 changes: 2 additions & 2 deletions apps/ns_server/src/kv_hibernation_agent.erl
Expand Up @@ -307,7 +307,7 @@ do_pause_bucket(#bucket_hibernation_op_args{
[Bucket, LocalPath, RemotePath, BlobStorageRegion,
RateLimit / ?MIB]),

ok = hibernation_utils:check_test_condition(node_pause_before_data_sync),
ok = testconditions:check_test_condition(node_pause_before_data_sync),
ok = hibernation_utils:sync_s3(
Args#bucket_hibernation_op_args{remote_path = RemotePath}, LocalPath,
to),
Expand All @@ -332,7 +332,7 @@ do_resume_bucket(#bucket_hibernation_op_args{
[Bucket, RemotePath, LocalPath, BlobStorageRegion,
RateLimit / ?MIB]),

ok = hibernation_utils:check_test_condition(node_resume_before_data_sync),
ok = testconditions:check_test_condition(node_resume_before_data_sync),

ok = hibernation_utils:sync_s3(Args#bucket_hibernation_op_args{
remote_path = RemotePath}, LocalPath,
Expand Down
12 changes: 1 addition & 11 deletions apps/ns_server/src/ns_bucket.erl
Expand Up @@ -1326,7 +1326,7 @@ wait_for_bucket_shutdown(BucketName, Nodes0, Timeout) ->
LeftoverNodes0
end,

check_test_condition({wait_for_bucket_shutdown, BucketName}),
testconditions:check_test_condition({wait_for_bucket_shutdown, BucketName}),

case LeftoverNodes of
[] ->
Expand Down Expand Up @@ -2389,16 +2389,6 @@ update_desired_servers(DesiredServers, BucketConfig) ->
lists:keystore(desired_servers, 1, BucketConfig,
{desired_servers, DesiredServers}).

%% Honour a {delay, MSecs} test condition stored for Step: log it, delete it
%% from the store and sleep for MSecs. Every other stored value is ignored
%% and ok is returned.
check_test_condition(Step) ->
    Stored = testconditions:get(Step),
    case Stored of
        {delay, MSecs} ->
            ?log_debug("Executing testcondition - ~p", [{Step, Stored}]),
            testconditions:delete(Step),
            timer:sleep(MSecs);
        _ ->
            ok
    end.

-spec get_expected_servers(proplists:proplist()) -> [node()].
%% Use this to get the list of servers that the bucket will be on after creation
get_expected_servers(BucketConfig) ->
Expand Down
2 changes: 1 addition & 1 deletion apps/ns_server/src/ns_orchestrator.erl
Expand Up @@ -1964,7 +1964,7 @@ handle_hibernation_manager_exit(normal, Bucket, Op) ->
ale:debug(?USER_LOGGER, "~p done for Bucket ~p.",
[Op, Bucket]),

ok = hibernation_utils:check_test_condition(
ok = testconditions:check_test_condition(
exit_ns_orchestrator_after_hibernation_op_done),

hibernation_utils:update_hibernation_status(completed),
Expand Down
61 changes: 11 additions & 50 deletions apps/ns_server/src/ns_rebalancer.erl
Expand Up @@ -1404,28 +1404,14 @@ check_test_condition(Step) ->
check_test_condition(Step, []).

check_test_condition(Step, Kind) ->
case testconditions:get(Step) of
fail ->
%% E.g. fail rebalance at the start.
%% Triggered by: testconditions:set(rebalance_start, fail)
trigger_failure(Step, []);
{delay, Sleep} ->
%% E.g. delay rebalance by 60s at the start.
%% Triggered by:
%% testconditions:set(rebalance_start, {delay, 60000})
trigger_delay(Step, [], Sleep);
{fail, Kind} ->
%% E.g. fail verify_replication for bucket "test".
%% Triggered by:
%% testconditions:set(verify_replication, {fail, "test"})
trigger_failure(Step, Kind);
{delay, Kind, Sleep} ->
%% E.g. delay service_rebalance_start by 1s for index service.
%% Triggered by:
%% testconditions:set(service_rebalance_start,
%% {delay, index, 1000})
trigger_delay(Step, Kind, Sleep);
{for_vb_move, Kind, N, Condition} ->
testconditions:check_test_condition(?REBALANCE_LOGGER, Step, Kind,
fun(Condition) ->
extended_check_test_condition(Condition, Step, Kind)
end).

extended_check_test_condition(Condition, Step, Kind) ->
case Condition of
{for_vb_move, Kind, N, Type} ->
%% Trigger the test condition for Nth vBucket move.
%% Note it is NOT vBucket #N, but rather the Nth vBucket
%% that is being moved. The actual vBucket # may be anything.
Expand All @@ -1441,42 +1427,17 @@ check_test_condition(Step, Kind) ->
%% Triggered by:
%% testconditions:set(backfill_done,
%% {for_vb_move, "test", 5, fail}).
trigger_condition_for_Nth_move(Step, Kind, N, Condition);
trigger_condition_for_Nth_move(Step, Kind, N, Type);
_ ->
ok
end.

%% Report a test-injected failure for Step (optionally qualified by Kind),
%% delete the test condition stored for Step and return
%% fail_by_test_condition. An empty Kind ([]) selects the shorter message.
trigger_failure(Step, Kind) ->
    {Format, Args} =
        case Kind of
            [] ->
                {"Failure triggered by test during ~p", [Step]};
            _ ->
                {"Failure triggered by test during ~p for ~p", [Step, Kind]}
        end,
    Text = lists:flatten(io_lib:format(Format, Args)),
    ?rebalance_error("~s", [Text]),
    testconditions:delete(Step),
    fail_by_test_condition.

%% Report a test-injected delay for Step (optionally qualified by Kind),
%% delete the test condition stored for Step and then sleep for Sleep ms.
%% An empty Kind ([]) selects the shorter message.
trigger_delay(Step, Kind, Sleep) ->
    {Format, Args} =
        case Kind of
            [] ->
                {"Delay triggered by test during ~p. "
                 "Sleeping for ~p ms", [Step, Sleep]};
            _ ->
                {"Delay triggered by test during ~p for ~p. "
                 "Sleeping for ~p ms", [Step, Kind, Sleep]}
        end,
    Text = lists:flatten(io_lib:format(Format, Args)),
    ?rebalance_error("~s", [Text]),
    testconditions:delete(Step),
    timer:sleep(Sleep).

trigger_condition_for_Nth_move(Step, Kind, 1, Condition) ->
case Condition of
fail ->
trigger_failure(Step, Kind);
testconditions:trigger_failure(?REBALANCE_LOGGER, Step, Kind);
{delay, Sleep} ->
trigger_delay(Step, Kind, Sleep)
testconditions:trigger_delay(?REBALANCE_LOGGER, Step, Kind, Sleep)
end;
trigger_condition_for_Nth_move(Step, Kind, N, Condition) ->
testconditions:set(Step, {for_vb_move, Kind, N - 1, Condition}).
Expand Down
4 changes: 2 additions & 2 deletions apps/ns_server/src/service_manager.erl
Expand Up @@ -346,7 +346,7 @@ pause_bucket_op(#state{service = Service,
service_manager = Manager},
{#bucket_hibernation_op_args{} = Args, _ExtraArgs},
Id, Leader, _NodesInfo) ->
ok = hibernation_utils:check_test_condition({pause_bucket, Service}),
ok = testconditions:check_test_condition({pause_bucket, Service}),
ok = service_agent:prepare_pause_bucket(Service, Nodes, Id, Args, Manager),
ok = service_agent:pause_bucket(Service, Leader, Id, Args, Manager).

Expand All @@ -356,7 +356,7 @@ resume_bucket_op(#state{service = Service,
{#bucket_hibernation_op_args{} = Args,
{DryRun, _ServerMapping}},
Id, Leader, _NodesInfo) ->
ok = hibernation_utils:check_test_condition({{resume_bucket, DryRun},
ok = testconditions:check_test_condition({{resume_bucket, DryRun},
Service}),
ok = service_agent:prepare_resume_bucket(
Service, Nodes, Id, Args, DryRun, Manager),
Expand Down
94 changes: 93 additions & 1 deletion apps/ns_server/src/testconditions.erl
Expand Up @@ -17,7 +17,13 @@
%% APIs
-export([get/1,
set/2,
delete/1]).
delete/1,
check_test_condition/1,
check_test_condition/2,
check_test_condition/3,
check_test_condition/4,
trigger_failure/3,
trigger_delay/4]).

get(Key) ->
simple_store:get(?TESTCONDITION_STORE, Key).
Expand All @@ -29,3 +35,89 @@ set(Key, Value) ->

delete(Key) ->
simple_store:delete(?TESTCONDITION_STORE, Key).

%%
%% Generic test condition handling:
%%
%% There are 2 types of generically handled test conditions:
%% 1. Applicable to a specific "step"
%% 2. Applicable to a specific "kind" of "step". "Kind" can be used as a
%% sub-tag for "step" for things like rebalance to inject a failure for a
%% specific bucket.
%%
%% There are two types of generically handled failures:
%% 1. Fail
%% 2. Delay. A delay can be used to inject other failures. E.g. Introduce a
%% delay of 60s during rebalance of a bucket. During those 60s, user can
%% SIGSTOP memcached on a node.
%%
%% An ExtendedHandler can be passed to allow custom handling of additional
%% conditions or failure types.
%% Check the test condition for Step, logging through ?NS_SERVER_LOGGER,
%% with no "kind" qualifier and no extended handler.
-spec check_test_condition(term()) -> term().
check_test_condition(Step) ->
    check_test_condition(?NS_SERVER_LOGGER, Step).

%% Check the test condition for Step, logging through Logger, with no
%% "kind" qualifier ([]) and no extended handler.
-spec check_test_condition(atom(), term()) -> term().
check_test_condition(Logger, Step) ->
    check_test_condition(Logger, Step, []).

%% Check the test condition for Step restricted to Kind, logging through
%% Logger, with no extended handler.
%%
%% Kind is an arbitrary term, not only a list: the documented usage
%% testconditions:set(service_rebalance_start, {delay, index, 1000}) is
%% matched with Kind = index, an atom. [] means "no kind".
-spec check_test_condition(atom(), term(), term()) -> term().
check_test_condition(Logger, Step, Kind) ->
    check_test_condition(Logger, Step, Kind, undefined).

%% Look up the test condition stored for Step and act on it.
%%
%% Logger          - ale logger the trigger message is written to.
%% Step            - key under which the test condition is stored.
%% Kind            - sub-tag for Step (e.g. a bucket name or a service atom).
%%                   Conditions carrying a kind only fire when the stored
%%                   kind is equal to this argument. [] means "no kind".
%% ExtendedHandler - 'undefined', or a fun/1 that receives any stored value
%%                   not handled generically here.
%%
%% Returns fail_by_test_condition for a triggered failure, the result of
%% timer:sleep/1 for a triggered delay, the ExtendedHandler's result when it
%% is invoked, and ok otherwise.
-spec check_test_condition(atom(), term(), term(),
                           undefined | fun((term()) -> term())) -> term().
check_test_condition(Logger, Step, Kind, ExtendedHandler) ->
    %% NOTE(review): the lookup is presumably kept module-qualified so that
    %% tests mocking testconditions:get/1 are honoured - confirm.
    case testconditions:get(Step) of
        fail ->
            %% E.g. fail rebalance at the start.
            %% Triggered by: testconditions:set(rebalance_start, fail)
            trigger_failure(Logger, Step, []);
        {delay, Sleep} ->
            %% E.g. delay rebalance by 60s at the start.
            %% Triggered by:
            %%  testconditions:set(rebalance_start, {delay, 60000})
            trigger_delay(Logger, Step, [], Sleep);
        {fail, Kind} ->
            %% Kind is already bound, so this clause only matches when the
            %% stored kind equals the requested one.
            %% E.g. fail verify_replication for bucket "test".
            %% Triggered by:
            %%  testconditions:set(verify_replication, {fail, "test"})
            trigger_failure(Logger, Step, Kind);
        {delay, Kind, Sleep} ->
            %% E.g. delay service_rebalance_start by 1s for index service.
            %% Triggered by:
            %%  testconditions:set(service_rebalance_start,
            %%                     {delay, index, 1000})
            trigger_delay(Logger, Step, Kind, Sleep);
        Condition ->
            %% Not a generically handled condition (this includes conditions
            %% stored for a different kind): defer to the caller's handler,
            %% if one was supplied.
            case ExtendedHandler of
                undefined -> ok;
                _ -> ExtendedHandler(Condition)
            end
    end.

%% Report a test-injected failure for Step and consume the test condition.
%%
%% The message is written to Logger at error level; Kind, when not [], is
%% included in the message. The condition stored for Step is then deleted
%% and fail_by_test_condition is returned.
-spec trigger_failure(atom(), term(), term()) -> fail_by_test_condition.
trigger_failure(Logger, Step, Kind) ->
    Msg = case Kind of
              [] ->
                  io_lib:format("Failure triggered by test during ~p",
                                [Step]);
              _ ->
                  io_lib:format("Failure triggered by test during ~p for ~p",
                                [Step, Kind])
          end,
    ale:error(Logger, "~s", [lists:flatten(Msg)]),
    testconditions:delete(Step),
    fail_by_test_condition.

%% Report a test-injected delay for Step, consume the test condition and
%% sleep.
%%
%% The message is written to Logger at error level; Kind, when not [], is
%% included in the message. The condition stored for Step is deleted before
%% sleeping. Sleep is passed unchanged to timer:sleep/1, whose result is
%% returned.
-spec trigger_delay(atom(), term(), term(), timeout()) -> ok.
trigger_delay(Logger, Step, Kind, Sleep) ->
    Msg = case Kind of
              [] ->
                  io_lib:format("Delay triggered by test during ~p. "
                                "Sleeping for ~p ms", [Step, Sleep]);
              _ ->
                  io_lib:format("Delay triggered by test during ~p for ~p. "
                                "Sleeping for ~p ms", [Step, Kind, Sleep])
          end,
    ale:error(Logger, "~s", [lists:flatten(Msg)]),

    testconditions:delete(Step),
    timer:sleep(Sleep).
2 changes: 1 addition & 1 deletion apps/ns_server/test/failover_tests.erl
Expand Up @@ -103,7 +103,7 @@ manual_failover_test_setup(SetupConfig) ->
meck:new(chronicle),
meck:expect(chronicle, check_quorum, fun() -> true end),

meck:new(testconditions),
meck:new(testconditions, [passthrough]),
meck:expect(testconditions, get, fun(_) -> ok end),

meck:new(chronicle_master, [passthrough]),
Expand Down

0 comments on commit f24bd7c

Please sign in to comment.