Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maglev: fix maglev stability by sorting host_weights beforehand #28055

Merged
merged 4 commits into from Jun 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelogs/current.yaml
Expand Up @@ -186,6 +186,9 @@ bug_fixes:
change: |
Fixes an issue with the ORIGINAL_DST cluster cleanup timer lifetime, which
can occur if the cluster is removed while the timer is armed.
- area: maglev loadbalancer
change: |
Fixes maglev stability problem. Previously, maglev returns slightly different backend assignment from the same backends and keys.

removed_config_or_runtime:
# *Normally occurs at the end of the* :ref:`deprecation period <deprecated>`
Expand Down
24 changes: 20 additions & 4 deletions source/extensions/load_balancing_policies/maglev/maglev_lb.cc
Expand Up @@ -81,16 +81,32 @@ void MaglevTable::constructMaglevTableInternal(
return;
}

// Implementation of pseudocode listing 1 in the paper (see header file for more info).
std::vector<TableBuildEntry> table_build_entries;
table_build_entries.reserve(normalized_host_weights.size());
// Prepare stable (sorted) vector of host_weight.
// Maglev requires stable order of table_build_entries because the hash table will be filled in
// the order. Unstable table_build_entries results the change of backend assignment.
std::vector<std::tuple<absl::string_view, HostConstSharedPtr, double>> sorted_host_weights;
sorted_host_weights.reserve(normalized_host_weights.size());
for (const auto& host_weight : normalized_host_weights) {
const auto& host = host_weight.first;
const absl::string_view key_to_hash = hashKey(host, use_hostname_for_hashing);
ASSERT(!key_to_hash.empty());

sorted_host_weights.emplace_back(key_to_hash, host_weight.first, host_weight.second);
}

std::sort(sorted_host_weights.begin(), sorted_host_weights.end());

// Implementation of pseudocode listing 1 in the paper (see header file for more info).
std::vector<TableBuildEntry> table_build_entries;
table_build_entries.reserve(normalized_host_weights.size());
for (const auto& sorted_host_weight : sorted_host_weights) {
const auto& key_to_hash = std::get<0>(sorted_host_weight);
const auto& host = std::get<1>(sorted_host_weight);
const auto& weight = std::get<2>(sorted_host_weight);

table_build_entries.emplace_back(host, HashUtil::xxHash64(key_to_hash) % table_size_,
(HashUtil::xxHash64(key_to_hash, 1) % (table_size_ - 1)) + 1,
host_weight.second);
weight);
}

constructImplementationInternals(table_build_entries, max_normalized_weight);
Expand Down
Expand Up @@ -285,6 +285,51 @@ TEST_P(MaglevLoadBalancerTest, BasicWithRetryHostPredicate) {
}
}

// Basic stability test.
TEST_P(MaglevLoadBalancerTest, BasicStability) {
host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:90", simTime()),
makeTestHost(info_, "tcp://127.0.0.1:91", simTime()),
makeTestHost(info_, "tcp://127.0.0.1:92", simTime()),
makeTestHost(info_, "tcp://127.0.0.1:93", simTime()),
makeTestHost(info_, "tcp://127.0.0.1:94", simTime()),
makeTestHost(info_, "tcp://127.0.0.1:95", simTime())};
host_set_.healthy_hosts_ = host_set_.hosts_;
host_set_.runCallbacks({}, {});
init(7);

EXPECT_EQ("maglev_lb.min_entries_per_host", lb_->stats().min_entries_per_host_.name());
EXPECT_EQ("maglev_lb.max_entries_per_host", lb_->stats().max_entries_per_host_.name());
EXPECT_EQ(1, lb_->stats().min_entries_per_host_.value());
EXPECT_EQ(2, lb_->stats().max_entries_per_host_.value());

// maglev: i=0 host=127.0.0.1:92
// maglev: i=1 host=127.0.0.1:94
// maglev: i=2 host=127.0.0.1:90
// maglev: i=3 host=127.0.0.1:91
// maglev: i=4 host=127.0.0.1:95
// maglev: i=5 host=127.0.0.1:90
// maglev: i=6 host=127.0.0.1:93
LoadBalancerPtr lb = lb_->factory()->create(lb_params_);
const std::vector<uint32_t> expected_assignments{2, 4, 0, 1, 5, 0, 3};
for (uint32_t i = 0; i < 3 * expected_assignments.size(); ++i) {
TestLoadBalancerContext context(i);
EXPECT_EQ(host_set_.hosts_[expected_assignments[i % expected_assignments.size()]],
lb->chooseHost(&context));
}

// Shuffle healthy_hosts_ to check stability of assignments
std::shuffle(host_set_.healthy_hosts_.begin(), host_set_.healthy_hosts_.end(),
std::default_random_engine());
host_set_.runCallbacks({}, {});
lb = lb_->factory()->create(lb_params_);

for (uint32_t i = 0; i < 3 * expected_assignments.size(); ++i) {
TestLoadBalancerContext context(i);
EXPECT_EQ(host_set_.hosts_[expected_assignments[i % expected_assignments.size()]],
lb->chooseHost(&context));
}
}

// Weighted sanity test.
TEST_P(MaglevLoadBalancerTest, Weighted) {
host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:90", simTime(), 1),
Expand Down