Skip to content

Commit

Permalink
chore: add oom stats to /metrics (#2680)
Browse files Browse the repository at this point in the history
* chore: add oom stats to /metrics

Expose oom/cmd errors, counted whenever we reject a command because the server has reached the OOM state (controlled by the oom_deny_ratio flag).
Expose oom/insert errors, counted whenever we refuse to insert a new key or to grow a dashtable (controlled by table_growth_margin).

Move the OOM command check to a place that covers all transaction types, including multi and squashing transactions.

---------

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
  • Loading branch information
romange committed Mar 3, 2024
1 parent 34895ae commit 082c5e0
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 12 deletions.
24 changes: 13 additions & 11 deletions src/server/main_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,19 @@ static optional<ErrorReply> VerifyConnectionAclStatus(const CommandId* cid,
// Decides whether the command `cid` may run for the connection `cntx`.
//
// Two checks are applied in order:
//  1. OOM guard. Only when the command carries CO::DENYOOM and the ServerState
//     obtained from ServerState::tlocal() has is_master set. The command is
//     rejected with kOutOfMemory if the used memory exceeds
//     max_memory_limit * oom_deny_ratio; every such rejection increments
//     stats.oom_error_cmd_cnt.
//  2. ACL verification, delegated to VerifyConnectionAclStatus.
//
// Returns the out-of-memory ErrorReply when the OOM guard fires, otherwise
// whatever VerifyConnectionAclStatus returns.
optional<ErrorReply> Service::VerifyCommandExecution(const CommandId* cid,
                                                     const ConnectionContext* cntx,
                                                     CmdArgList tail_args) {
  ServerState& server_state = *ServerState::tlocal();

  const bool oom_guarded = (cid->opt_mask() & CO::DENYOOM) && server_state.is_master;
  if (oom_guarded) {
    // The current timestamp is handed to GetUsedMemory together with the query.
    const uint64_t now_ns = absl::GetCurrentTimeNanos();
    const uint64_t mem_in_use = server_state.GetUsedMemory(now_ns);
    const double deny_ratio = GetFlag(FLAGS_oom_deny_ratio);

    const bool over_limit = mem_in_use > (max_memory_limit * deny_ratio);
    if (over_limit) {
      ++server_state.stats.oom_error_cmd_cnt;
      return facade::ErrorReply{kOutOfMemory};
    }
  }

  return VerifyConnectionAclStatus(cid, cntx, "ACL rules changed between the MULTI and EXEC",
                                   tail_args);
}
Expand Down Expand Up @@ -1125,16 +1137,6 @@ void Service::DispatchCommand(CmdArgList args, facade::ConnectionContext* cntx)
return cntx->SendSimpleString("QUEUED");
}

if (cid->opt_mask() & CO::DENYOOM && etl.is_master) {
uint64_t start_ns = absl::GetCurrentTimeNanos();

uint64_t used_memory = etl.GetUsedMemory(start_ns);
double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
if (used_memory > (max_memory_limit * oom_deny_ratio)) {
return cntx->reply_builder()->SendError(kOutOfMemory);
}
}

// Create command transaction
intrusive_ptr<Transaction> dist_trans;

Expand Down
7 changes: 7 additions & 0 deletions src/server/server_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,13 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
&resp->body());
AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE,
&resp->body());

if (m.events.insertion_rejections | m.coordinator_stats.oom_error_cmd_cnt) {
AppendMetricValue("oom_errors_total", m.events.insertion_rejections, {"type"}, {"insert"},
&resp->body());
AppendMetricValue("oom_errors_total", m.coordinator_stats.oom_error_cmd_cnt, {"type"}, {"cmd"},
&resp->body());
}
if (sdata_res.has_value()) {
size_t rss = sdata_res->vm_rss + sdata_res->hugetlb_pages;
AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body());
Expand Down
3 changes: 2 additions & 1 deletion src/server/server_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ auto ServerState::Stats::operator=(Stats&& other) -> Stats& {
}

ServerState::Stats& ServerState::Stats::Add(unsigned num_shards, const ServerState::Stats& other) {
static_assert(sizeof(Stats) == 14 * 8, "Stats size mismatch");
static_assert(sizeof(Stats) == 15 * 8, "Stats size mismatch");

for (int i = 0; i < NUM_TX_TYPES; ++i) {
this->tx_type_cnt[i] += other.tx_type_cnt[i];
Expand All @@ -64,6 +64,7 @@ ServerState::Stats& ServerState::Stats::Add(unsigned num_shards, const ServerSta
this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec;

this->blocked_on_interpreter += other.blocked_on_interpreter;
this->oom_error_cmd_cnt += other.oom_error_cmd_cnt;

if (this->tx_width_freq_arr == nullptr) {
this->tx_width_freq_arr = new uint64_t[num_shards];
Expand Down
3 changes: 3 additions & 0 deletions src/server/server_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ class ServerState { // public struct - to allow initialization.

uint64_t blocked_on_interpreter = 0;

// Number of times we rejected command dispatch due to OOM condition.
uint64_t oom_error_cmd_cnt = 0;

// Array of size of number of shards.
// Each entry is how many transactions we had with this width (unique_shard_cnt).
uint64_t* tx_width_freq_arr = nullptr;
Expand Down

0 comments on commit 082c5e0

Please sign in to comment.