Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dns: respecting DNS TTL #18408

Merged
merged 10 commits into from
Oct 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,12 @@ message DnsCacheConfig {
config.cluster.v3.Cluster.DnsLookupFamily dns_lookup_family = 2
[(validate.rules).enum = {defined_only: true}];

// The DNS refresh rate for currently cached DNS hosts. If not specified defaults to 60s.
//
// .. note:
//
// The returned DNS TTL is not currently used to alter the refresh rate. This feature will be
// added in a future change.
//
// .. note:
// The DNS refresh rate for unresolved DNS hosts. If not specified defaults to 60s.
//
// The refresh rate is rounded to the closest millisecond, and must be at least 1ms.
//
// Once a host has been resolved, the refresh rate will be the DNS TTL, capped
// at a minimum of 5s.
htuch marked this conversation as resolved.
Show resolved Hide resolved
google.protobuf.Duration dns_refresh_rate = 3
[(validate.rules).duration = {gte {nanos: 1000000}}];

Expand Down
1 change: 1 addition & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Minor Behavior Changes
*Changes that may cause incompatibilities for some users, but should not for most*

* config: the log message for "gRPC config stream closed" now uses the most recent error message, and reports seconds instead of milliseconds for how long the most recent status has been received.
* dns: now respecting the returned DNS TTL for resolved hosts, rather than always relying on the hard-coded :ref:`dns_refresh_rate. <envoy_v3_api_field_config.cluster.v3.Cluster.dns_refresh_rate>` This behavior can be temporarily reverted by setting the runtime guard ``envoy.reloadable_features.use_dns_ttl`` to false.

Bug Fixes
---------
Expand Down
6 changes: 6 additions & 0 deletions envoy/common/backoff_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ class BackOffStrategy {
* Resets the intervals so that the back off intervals can start again.
*/
virtual void reset() PURE;

/**
* Resets the interval with a (potentially) new starting point.
* @param base_interval the new base interval for the backoff strategy.
*/
virtual void reset(uint64_t base_interval) PURE;
};

using BackOffStrategyPtr = std::unique_ptr<BackOffStrategy>;
Expand Down
12 changes: 9 additions & 3 deletions source/common/common/backoff_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@ class JitteredExponentialBackOffStrategy : public BackOffStrategy {
// BackOffStrategy methods
uint64_t nextBackOffMs() override;
void reset() override;
void reset(uint64_t base_interval) override {
base_interval_ = base_interval;
reset();
}

private:
const uint64_t base_interval_;
uint64_t base_interval_;
const uint64_t max_interval_{};
uint64_t next_interval_;
Random::RandomGenerator& random_;
Expand All @@ -53,9 +57,10 @@ class JitteredLowerBoundBackOffStrategy : public BackOffStrategy {
// BackOffStrategy methods
uint64_t nextBackOffMs() override;
void reset() override {}
void reset(uint64_t min_interval) override { min_interval_ = min_interval; }

private:
const uint64_t min_interval_;
uint64_t min_interval_;
Random::RandomGenerator& random_;
};

Expand All @@ -74,9 +79,10 @@ class FixedBackOffStrategy : public BackOffStrategy {
// BackOffStrategy methods.
uint64_t nextBackOffMs() override;
void reset() override {}
void reset(uint64_t interval_ms) override { interval_ms_ = interval_ms; }

private:
const uint64_t interval_ms_;
uint64_t interval_ms_;
};

} // namespace Envoy
1 change: 1 addition & 0 deletions source/common/runtime/runtime_features.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ constexpr const char* runtime_features[] = {
"envoy.reloadable_features.udp_listener_updates_filter_chain_in_place",
"envoy.reloadable_features.udp_per_event_loop_read_limit",
"envoy.reloadable_features.unquote_log_string_values",
"envoy.reloadable_features.use_dns_ttl",
"envoy.reloadable_features.use_observable_cluster_name",
"envoy.reloadable_features.validate_connect",
"envoy.reloadable_features.vhds_heartbeats",
Expand Down
41 changes: 22 additions & 19 deletions source/extensions/common/dynamic_forward_proxy/dns_cache_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ namespace DynamicForwardProxy {
DnsCacheImpl::DnsCacheImpl(
Server::Configuration::FactoryContextBase& context,
const envoy::extensions::common::dynamic_forward_proxy::v3::DnsCacheConfig& config)
: main_thread_dispatcher_(context.mainThreadDispatcher()),
: main_thread_dispatcher_(context.mainThreadDispatcher()), config_(config),
random_generator_(context.api().randomGenerator()),
dns_lookup_family_(DnsUtils::getDnsLookupFamilyFromEnum(config.dns_lookup_family())),
resolver_(selectDnsResolver(config, main_thread_dispatcher_)),
tls_slot_(context.threadLocal()),
Expand All @@ -27,10 +28,6 @@ DnsCacheImpl::DnsCacheImpl(
config.dns_cache_circuit_breaker()),
refresh_interval_(PROTOBUF_GET_MS_OR_DEFAULT(config, dns_refresh_rate, 60000)),
timeout_interval_(PROTOBUF_GET_MS_OR_DEFAULT(config, dns_query_timeout, 5000)),
failure_backoff_strategy_(
Config::Utility::prepareDnsRefreshStrategy<
envoy::extensions::common::dynamic_forward_proxy::v3::DnsCacheConfig>(
config, refresh_interval_.count(), context.api().randomGenerator())),
file_system_(context.api().fileSystem()),
validation_visitor_(context.messageValidationVisitor()),
host_ttl_(PROTOBUF_GET_MS_OR_DEFAULT(config, host_ttl, 300000)),
Expand Down Expand Up @@ -245,8 +242,8 @@ void DnsCacheImpl::onReResolve(const std::string& host) {
const std::chrono::steady_clock::duration now_duration =
main_thread_dispatcher_.timeSource().monotonicTime().time_since_epoch();
auto last_used_time = primary_host.host_info_->lastUsedTime();
ENVOY_LOG(debug, "host='{}' TTL check: now={} last_used={}", host, now_duration.count(),
last_used_time.count());
ENVOY_LOG(debug, "host='{}' TTL check: now={} last_used={} TTL {}", host, now_duration.count(),
last_used_time.count(), host_ttl_.count());
if ((now_duration - last_used_time) > host_ttl_) {
ENVOY_LOG(debug, "host='{}' TTL expired, removing", host);
// If the host has no address then that means that the DnsCacheImpl has never
Expand Down Expand Up @@ -365,34 +362,36 @@ void DnsCacheImpl::finishResolve(const std::string& host,
if (!resolution_time.has_value()) {
resolution_time = main_thread_dispatcher_.timeSource().monotonicTime();
}
std::chrono::seconds dns_ttl =
std::chrono::duration_cast<std::chrono::seconds>(refresh_interval_);
if (new_address) {
// Update the cache entry and staleness any time the ttl changes.
if (!from_cache) {
addCacheEntry(host, new_address, response.front().ttl_);
}
primary_host_info->host_info_->updateStale(resolution_time.value(), response.front().ttl_);
if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.use_dns_ttl")) {
// Arbitrarily cap DNS re-resolution at 5s to avoid constant DNS queries.
dns_ttl = std::max<std::chrono::seconds>(std::chrono::seconds(5), response.front().ttl_);
}
primary_host_info->host_info_->updateStale(resolution_time.value(), dns_ttl);
}

if (first_resolve) {
primary_host_info->host_info_->setFirstResolveComplete();
}
if (first_resolve || address_changed) {
// TODO(alyssawilk) only notify threads of stale results after a resolution
// timeout.
if (first_resolve || (address_changed && !primary_host_info->host_info_->isStale())) {
notifyThreads(host, primary_host_info->host_info_);
}

// Kick off the refresh timer.
// TODO(mattklein123): Consider jitter here. It may not be necessary since the initial host
// is populated dynamically.
// TODO(alyssawilk) also consider TTL here.
if (status == Network::DnsResolver::ResolutionStatus::Success) {
failure_backoff_strategy_->reset();
primary_host_info->refresh_timer_->enableTimer(refresh_interval_);
primary_host_info->failure_backoff_strategy_->reset(
std::chrono::duration_cast<std::chrono::milliseconds>(dns_ttl).count());
primary_host_info->refresh_timer_->enableTimer(dns_ttl);
ENVOY_LOG(debug, "DNS refresh rate reset for host '{}', refresh rate {} ms", host,
refresh_interval_.count());
dns_ttl.count() * 1000);
} else {
const uint64_t refresh_interval = failure_backoff_strategy_->nextBackOffMs();
const uint64_t refresh_interval = primary_host_info->failure_backoff_strategy_->nextBackOffMs();
primary_host_info->refresh_timer_->enableTimer(std::chrono::milliseconds(refresh_interval));
ENVOY_LOG(debug, "DNS refresh rate reset for host '{}', (failure) refresh rate {} ms", host,
refresh_interval);
Expand Down Expand Up @@ -451,7 +450,11 @@ DnsCacheImpl::PrimaryHostInfo::PrimaryHostInfo(DnsCacheImpl& parent,
refresh_timer_(parent.main_thread_dispatcher_.createTimer(refresh_timer_cb)),
timeout_timer_(parent.main_thread_dispatcher_.createTimer(timeout_timer_cb)),
host_info_(std::make_shared<DnsHostInfoImpl>(parent.main_thread_dispatcher_.timeSource(),
host_to_resolve, is_ip_address)) {
host_to_resolve, is_ip_address)),
failure_backoff_strategy_(
Config::Utility::prepareDnsRefreshStrategy<
envoy::extensions::common::dynamic_forward_proxy::v3::DnsCacheConfig>(
parent_.config_, parent_.refresh_interval_.count(), parent_.random_generator_)) {
parent_.stats_.host_added_.inc();
parent_.stats_.num_hosts_.inc();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
void updateStale(MonotonicTime resolution_time, std::chrono::seconds ttl) {
stale_at_time_ = resolution_time + ttl;
}
bool isStale() {
return time_source_.monotonicTime() > static_cast<MonotonicTime>(stale_at_time_);
}

void setAddress(Network::Address::InstanceConstSharedPtr address) {
absl::WriterMutexLock lock{&resolve_lock_};
Expand Down Expand Up @@ -160,6 +163,7 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
const Event::TimerPtr refresh_timer_;
const Event::TimerPtr timeout_timer_;
const DnsHostInfoImplSharedPtr host_info_;
const BackOffStrategyPtr failure_backoff_strategy_;
Network::ActiveDnsQuery* active_query_{};
};

Expand Down Expand Up @@ -199,6 +203,8 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
PrimaryHostInfo* createHost(const std::string& host, uint16_t default_port);

Event::Dispatcher& main_thread_dispatcher_;
const envoy::extensions::common::dynamic_forward_proxy::v3::DnsCacheConfig config_;
Random::RandomGenerator& random_generator_;
const Network::DnsLookupFamily dns_lookup_family_;
const Network::DnsResolverSharedPtr resolver_;
ThreadLocal::TypedSlot<ThreadLocalHostInfo> tls_slot_;
Expand All @@ -212,7 +218,6 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
DnsCacheResourceManagerImpl resource_manager_;
const std::chrono::milliseconds refresh_interval_;
const std::chrono::milliseconds timeout_interval_;
const BackOffStrategyPtr failure_backoff_strategy_;
Filesystem::Instance& file_system_;
ProtobufMessage::ValidationVisitor& validation_visitor_;
const std::chrono::milliseconds host_ttl_;
Expand Down
12 changes: 11 additions & 1 deletion test/common/common/backoff_strategy_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,15 @@ TEST(ExponentialBackOffStrategyTest, JitteredBackOffBasicReset) {
ON_CALL(random, random()).WillByDefault(Return(27));

JitteredExponentialBackOffStrategy jittered_back_off(25, 30, random);
EXPECT_EQ(2, jittered_back_off.nextBackOffMs());
EXPECT_EQ(2, jittered_back_off.nextBackOffMs()); // 25 % 27
EXPECT_EQ(27, jittered_back_off.nextBackOffMs());

jittered_back_off.reset();
EXPECT_EQ(2, jittered_back_off.nextBackOffMs()); // Should start from start
EXPECT_EQ(27, jittered_back_off.nextBackOffMs());

jittered_back_off.reset(26);
EXPECT_EQ(1, jittered_back_off.nextBackOffMs()); // 26 % 27
}

TEST(ExponentialBackOffStrategyTest, JitteredBackOffDoesntOverflow) {
Expand Down Expand Up @@ -77,6 +81,9 @@ TEST(ExponentialBackOffStrategyTest, JitteredBackOffWithMaxIntervalReset) {
EXPECT_EQ(79, jittered_back_off.nextBackOffMs());
EXPECT_EQ(99, jittered_back_off.nextBackOffMs()); // Should return Max here
EXPECT_EQ(99, jittered_back_off.nextBackOffMs());

jittered_back_off.reset(4);
EXPECT_EQ(3, jittered_back_off.nextBackOffMs());
}

TEST(LowerBoundBackOffStrategyTest, JitteredBackOffWithLowRandomValue) {
Expand All @@ -102,6 +109,9 @@ TEST(FixedBackOffStrategyTest, FixedBackOffBasicReset) {

fixed_back_off.reset();
EXPECT_EQ(30, fixed_back_off.nextBackOffMs());

fixed_back_off.reset(20);
EXPECT_EQ(20, fixed_back_off.nextBackOffMs());
}

} // namespace Envoy
Loading