Skip to content

Commit

Permalink
MB-42737: Don't terminate memcached when reconfig of prometheus fails
Browse files Browse the repository at this point in the history
Terminate memcached if we fail to open the prometheus port during
startup, but if some error happens during reconfigure we should
just log it instead of terminating the process.

The current reconfigure scheme of interfaces is risky as we may
not be able to open up ports. This is all addressed in MB-39620
where we provide commands to define/delete interfaces so that
the calling process knows which ports we listen on.

Killing memcached for a reconfigure failure will cause data loss
of all items not persisted to disk/replicated and make memcached
enter a warmup phase. ns_server may work around the problem today
by trying to reload the configuration and we'll retry binding to
the port. If that continues to fail it should move to a different
port (or even better: use an ephemeral port)

Change-Id: If29159d94f06f96d132e08de16eee6cfb73d4b00
Reviewed-on: http://review.couchbase.org/c/kv_engine/+/142971
Reviewed-by: James Harrison <james.harrison@couchbase.com>
Reviewed-by: Dave Rigby <daver@couchbase.com>
Tested-by: Build Bot <build@couchbase.com>
  • Loading branch information
trondn committed Jan 11, 2021
1 parent e388643 commit 19f7e19
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 21 deletions.
29 changes: 21 additions & 8 deletions daemon/memcached.cc
Original file line number Diff line number Diff line change
Expand Up @@ -254,21 +254,34 @@ static bool prometheus_auth_callback(const std::string& user,
}

static void prometheus_changed_listener(const std::string&, Settings& s) {
cb::prometheus::initialize(s.getPrometheusConfig(),
server_prometheus_stats,
prometheus_auth_callback);
if (networkInterfaceManager) {
networkInterfaceManager->signal();
try {
cb::prometheus::initialize(s.getPrometheusConfig(),
server_prometheus_stats,
prometheus_auth_callback);
if (networkInterfaceManager) {
networkInterfaceManager->signal();
}
} catch (const std::exception& exception) {
// Error message already formatted. Just log the failure but don't
// terminate memcached... ns_server could always try to store
// a new configuration and we'll try again (once we move over to
// ifconfig they will know when it fails)!
LOG_CRITICAL("{}", exception.what());
}
}

static void prometheus_init() {
auto& settings = Settings::instance();

if (Settings::instance().has.prometheus_config) {
cb::prometheus::initialize(settings.getPrometheusConfig(),
server_prometheus_stats,
prometheus_auth_callback);
try {
cb::prometheus::initialize(settings.getPrometheusConfig(),
server_prometheus_stats,
prometheus_auth_callback);
} catch (const std::exception& exception) {
// Error message already formatted in the exception
FATAL_ERROR(EXIT_FAILURE, "{}", exception.what());
}
} else {
LOG_WARNING("Prometheus config not specified");
}
Expand Down
23 changes: 20 additions & 3 deletions include/statistics/prometheus.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,31 @@ using AuthCallback =
using GetStatsCallback =
std::function<ENGINE_ERROR_CODE(const StatCollector&, Cardinality)>;

/**
* Initialize the prometheus exporter
*
* @param config the port number and address family to bind to (specifying
* 0 as the port number will use an ephemeral port)
* @param getStatsCB The callback function to call to retrieve the statistic
* @param authCB The callback to use for authentication for the requests
* @return The current configuration
* @throws std::bad_alloc for memory allocation failures
* @throws std::runtime_errors if we failed to start the exporter service
*/
STATISTICS_PUBLIC_API
void initialize(const std::pair<in_port_t, sa_family_t>& config,
GetStatsCallback getStatsCB,
AuthCallback authCB);
nlohmann::json initialize(const std::pair<in_port_t, sa_family_t>& config,
GetStatsCallback getStatsCB,
AuthCallback authCB);

STATISTICS_PUBLIC_API
void shutdown();

STATISTICS_PUBLIC_API
std::pair<in_port_t, sa_family_t> getRunningConfig();

STATISTICS_PUBLIC_API
nlohmann::json getRunningConfigAsJson();

/**
* Global manager for exposing stats for Prometheus.
*
Expand Down Expand Up @@ -98,6 +112,8 @@ class MetricServer {

[[nodiscard]] std::pair<in_port_t, sa_family_t> getRunningConfig() const;

[[nodiscard]] nlohmann::json getRunningConfigAsJson() const;

private:
class KVCollectable;

Expand All @@ -116,5 +132,6 @@ class MetricServer {
static const std::string authRealm;

const sa_family_t family;
const std::string uuid;
};
} // namespace cb::prometheus
52 changes: 42 additions & 10 deletions statistics/prometheus.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <daemon/stats.h>
#include <folly/SynchronizedPtr.h>
#include <logger/logger.h>
#include <nlohmann/json.hpp>
#include <platform/uuid.h>
#include <prometheus/exposer.h>
#include <gsl/gsl>
#include <utility>
Expand All @@ -35,9 +37,9 @@ const std::string MetricServer::authRealm = "KV Prometheus Exporter";

folly::SynchronizedPtr<std::unique_ptr<MetricServer>> instance;

void initialize(const std::pair<in_port_t, sa_family_t>& config,
GetStatsCallback getStatsCB,
AuthCallback authCB) {
nlohmann::json initialize(const std::pair<in_port_t, sa_family_t>& config,
GetStatsCallback getStatsCB,
AuthCallback authCB) {
auto handle = instance.wlockPointer();
// May be called at init or on config change.
// If an instance already exists, destroy it before creating
Expand All @@ -48,11 +50,13 @@ void initialize(const std::pair<in_port_t, sa_family_t>& config,
*handle = std::make_unique<MetricServer>(
port, family, std::move(getStatsCB), std::move(authCB));
if (!(*handle)->isAlive()) {
FATAL_ERROR(EXIT_FAILURE,
fmt::format("Failed to start Prometheus exposer on "
"family:{} port:{}",
(family == AF_INET) ? "inet" : "inet6",
port));
handle->reset();
auto message = fmt::format(
"Failed to start Prometheus "
"exposer on family:{} port:{}",
(family == AF_INET) ? "inet" : "inet6",
port);
throw std::runtime_error(message);
}

// if the configured port is set to 0, an available port number will have
Expand All @@ -61,6 +65,7 @@ void initialize(const std::pair<in_port_t, sa_family_t>& config,
LOG_INFO("Prometheus Exporter started, listening on family:{} port:{}",
(family == AF_INET) ? "inet" : "inet6",
listeningPort);
return (*handle)->getRunningConfigAsJson();
}

void shutdown() {
Expand All @@ -76,6 +81,15 @@ std::pair<in_port_t, sa_family_t> getRunningConfig() {
return handle->getRunningConfig();
}

nlohmann::json getRunningConfigAsJson() {
auto handle = instance.rlock();
if (!handle || !handle->isAlive()) {
// no MetricServer, or it is not listening
return {};
}
return handle->getRunningConfigAsJson();
}

class MetricServer::KVCollectable : public ::prometheus::Collectable {
public:
KVCollectable(Cardinality cardinality, GetStatsCallback getStatsCB)
Expand Down Expand Up @@ -119,7 +133,8 @@ MetricServer::MetricServer(in_port_t port,
AuthCallback authCB)
: stats(std::make_shared<KVCollectable>(Cardinality::Low, getStatsCB)),
statsHC(std::make_shared<KVCollectable>(Cardinality::High, getStatsCB)),
family(family) {
family(family),
uuid(::to_string(cb::uuid::random())) {
try {
/*
* The connectionStr should meet the spec for civetweb's
Expand All @@ -145,7 +160,9 @@ MetricServer::MetricServer(in_port_t port,
exposer->RegisterAuth(authCB, authRealm, lowCardinalityPath);
exposer->RegisterAuth(authCB, authRealm, highCardinalityPath);
} catch (const std::exception& e) {
LOG_ERROR("Failed start Prometheus Exposer: {}", e.what());
LOG_ERROR("Failed to start Prometheus Exposer: {}", e.what());
// Kill a partially initialized object
exposer.reset();
}
}

Expand Down Expand Up @@ -174,4 +191,19 @@ std::pair<in_port_t, sa_family_t> MetricServer::getRunningConfig() const {
return {getListeningPort(), family};
}

nlohmann::json MetricServer::getRunningConfigAsJson() const {
nlohmann::json ret;
if (family == AF_INET) {
ret["host"] = "127.0.0.1";
ret["family"] = "inet";
} else {
ret["host"] = "::1";
ret["family"] = "inet6";
}
ret["port"] = getListeningPort();
ret["type"] = "prometheus";
ret["uuid"] = uuid;
return ret;
}

} // namespace cb::prometheus

0 comments on commit 19f7e19

Please sign in to comment.