service: raft: force initial snapshot transfer in new cluster
When we upgrade a cluster to use Raft, or perform the manual Raft
recovery procedure (which also creates a fresh group 0 cluster, using
the same algorithm as during upgrade), we start with a non-empty group 0
state machine; in particular, the schema tables are non-empty.

In this case we need to ensure that nodes which join group 0 receive the
group 0 state. Right now this is not the case. In previous releases,
where group 0 consisted only of schema, and schema pulls were also done
outside Raft, those nodes received schema through this outside
mechanism. In 91f609d we disabled
schema pulls outside Raft; we're also extending group 0 with other
things, like topology-specific state.

To solve this, we force snapshot transfers by setting the initial
snapshot index on the first group 0 server to `1` instead of `0`. During
replication, Raft will see that the joining servers are behind,
triggering a snapshot transfer and forcing them to pull the group 0 state.
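
As a minimal standalone sketch (not ScyllaDB code) of why an initial
snapshot index of `1` has this effect: a joining server's log starts empty,
so the first entry it needs is already covered by the leader's snapshot,
and the only way to catch it up is a snapshot transfer.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // First group 0 server: initial snapshot index forced to 1 (instead of 0).
    const uint64_t leader_snapshot_idx = 1;
    // Fresh joiner: empty log, so the next entry it needs is index 1.
    const uint64_t follower_next_idx = 1;

    if (follower_next_idx <= leader_snapshot_idx) {
        // The entry the follower needs is covered by the leader's snapshot,
        // so the leader must send the snapshot, carrying the group 0 state.
        std::puts("send snapshot (group 0 state transferred)");
    } else {
        // With snapshot index 0, the follower could be caught up by ordinary
        // log replication and no state transfer would happen.
        std::puts("send log entries");
    }
}
```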

It's unnecessary to do this for a cluster which bootstraps with Raft
enabled right away, but it also doesn't hurt, so we keep the logic
simple and don't introduce branches based on that.

Extend the Raft upgrade tests with a node bootstrap step at the end to
prevent regressions (without this patch, the step would hang: the node
would never join, waiting for schema).

Fixes: scylladb#14066

Closes scylladb#14336
kbr-scylla authored and tgrabiec committed Jun 29, 2023
1 parent 3d81408 commit ff386e7
Showing 6 changed files with 42 additions and 11 deletions.
12 changes: 12 additions & 0 deletions raft/raft.hh
@@ -750,6 +750,18 @@ public:
// apply call 'state_machine::load_snapshot(snapshot::id)'
// Called during Raft server initialization only, should not
// run in parallel with store.
+//
+// If you want to create a Raft cluster with a non-empty state
+// machine, so that joining servers always receive a snapshot,
+// you should:
+// - make sure that members of the initial configuration have
+// the same state machine state,
+// - set the initial snapshot index on members of the initial
+// configuration to 1,
+// - set the initial snapshot index on all subsequently joining
+// servers to 0.
+// This also works if you start with an empty state machine,
+// so consider it as the go-to default.
virtual future<snapshot_descriptor> load_snapshot_descriptor() = 0;

// Persist given log entries.
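As a rough illustration of the recipe described in the new comment above (a
hypothetical sketch, not part of this commit: `my_storage`, `_initial_config`
and `_is_initial_member` are assumed names, and the implementation is
deliberately minimal):

```cpp
#include <seastar/core/future.hh>
#include "raft/raft.hh"

// Hypothetical persistence implementation following the comment above.
class my_storage : public raft::persistence {
    raft::configuration _initial_config;  // assumed: same on all initial members
    bool _is_initial_member = false;      // assumed: true only on the initial configuration
public:
    seastar::future<raft::snapshot_descriptor> load_snapshot_descriptor() override {
        raft::snapshot_descriptor snp;
        snp.id = raft::snapshot_id::create_random_id();
        snp.config = _initial_config;
        // Initial members start at snapshot index 1, so joiners (which start
        // at index 0) are behind and receive a full snapshot transfer.
        snp.idx = _is_initial_member ? raft::index_t{1} : raft::index_t{0};
        return seastar::make_ready_future<raft::snapshot_descriptor>(std::move(snp));
    }
    // ... remaining persistence methods elided ...
};
```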
10 changes: 9 additions & 1 deletion service/raft/raft_group0.cc
@@ -420,14 +420,22 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, bool as_
if (server == nullptr) {
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
raft::configuration initial_configuration;
+bool nontrivial_snapshot = false;
if (g0_info.id == my_id) {
// We were chosen as the discovery leader.
// We should start a new group with this node as voter.
group0_log.info("Server {} chosen as discovery leader; bootstrapping group 0 from scratch", my_id);
initial_configuration.current.emplace(my_addr, true);
+// Force snapshot transfer from us to subsequently joining servers.
+// This is important for upgrade and recovery, where the group 0 state machine
+// (schema tables in particular) is nonempty.
+// In a fresh cluster this will trigger an empty snapshot transfer which is redundant but correct.
+// See #14066.
+nontrivial_snapshot = true;
}
// Bootstrap the initial configuration
-co_await raft_sys_table_storage(qp, group0_id, my_id).bootstrap(std::move(initial_configuration));
+co_await raft_sys_table_storage(qp, group0_id, my_id)
+    .bootstrap(std::move(initial_configuration), nontrivial_snapshot);
co_await start_server_for_group0(group0_id, ss, qp, mm, cdc_gen_service);
server = &_raft_gr.group0();
// FIXME if we crash now or after getting added to the config but before storing group 0 ID,
5 changes: 3 additions & 2 deletions service/raft/raft_sys_table_storage.cc
@@ -323,8 +323,9 @@ future<> raft_sys_table_storage::execute_with_linearization_point(std::function<
}
}

-future<> raft_sys_table_storage::bootstrap(raft::configuration initial_configuation) {
-    raft::snapshot_descriptor snapshot;
+future<> raft_sys_table_storage::bootstrap(raft::configuration initial_configuation, bool nontrivial_snapshot) {
+    auto init_index = nontrivial_snapshot ? raft::index_t{1} : raft::index_t{0};
+    raft::snapshot_descriptor snapshot{.idx{init_index}};
snapshot.id = raft::snapshot_id::create_random_id();
snapshot.config = std::move(initial_configuation);
co_await store_snapshot_descriptor(snapshot, 0);
11 changes: 8 additions & 3 deletions service/raft/raft_sys_table_storage.hh
@@ -73,9 +73,14 @@ public:

// Persist initial configuration of a new Raft group.
// To be called before start for the new group.
-// Uses a special snapshot id (0) to identify the snapshot
-// descriptor.
-future<> bootstrap(raft::configuration initial_configuation);
+//
+// If `nontrivial_snapshot` is true, the initial snapshot will have index 1 instead of 0,
+// which will trigger a snapshot transfer to servers which start with snapshot index 0.
+// This should be set for the first group 0 server during upgrade or recovery, which
+// will force snapshot transfers for subsequently joining nodes (so we can transfer initial
+// schema etc.). It's also correct to do it when booting a cluster from
+// scratch with Raft, although not necessary (it will force an empty snapshot transfer).
+future<> bootstrap(raft::configuration initial_configuation, bool nontrivial_snapshot);
private:

future<size_t> do_store_log_entries_one_batch(const std::vector<raft::log_entry_ptr>& entries, size_t start_idx);
8 changes: 5 additions & 3 deletions test/topology_raft_disabled/test_raft_upgrade_basic.py
@@ -24,9 +24,6 @@
@log_run_time
@pytest.mark.replication_factor(1)
async def test_raft_upgrade_basic(manager: ManagerClient, random_tables: RandomTables):
"""
kbr-: the test takes about 7 seconds in dev mode on my laptop.
"""
servers = await manager.running_servers()
cql = manager.cql
assert(cql)
@@ -53,3 +50,8 @@ async def test_raft_upgrade_basic(manager: ManagerClient, random_tables: RandomT
assert(rs)
logging.info(f"group0_history entry description: '{rs[0].description}'")
assert(table.full_name in rs[0].description)

logging.info("Booting new node")
await manager.server_add(config={
'consistent_cluster_management': True
})
@@ -30,8 +30,6 @@ async def test_recovery_after_majority_loss(manager: ManagerClient, random_table
used to recover group 0 might have missed them. However in this test the driver waits
for schema agreement to complete before proceeding, so we know that every server learned
about the schema changes.
-kbr-: the test takes about 22 seconds in dev mode on my laptop.
"""
servers = await manager.running_servers()

@@ -85,3 +83,8 @@ async def test_recovery_after_majority_loss(manager: ManagerClient, random_table

logging.info("Creating another table")
await random_tables.add_table(ncolumns=5)

logging.info("Booting new node")
await manager.server_add(config={
'consistent_cluster_management': True
})
