// Senpai.cpp
/*
* Copyright (C) 2018-present, Facebook, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "oomd/plugins/Senpai.h"
#include <pthread.h>
#include <cerrno>
#include <csignal>
#include <future>
#include <iomanip>
#include <sstream>
#include "oomd/Log.h"
#include "oomd/PluginRegistry.h"
#include "oomd/util/Fs.h"
#include "oomd/util/PluginArgParser.h"
#include "oomd/util/ScopeGuard.h"
#include "oomd/util/Util.h"
namespace Oomd {
// Plugin registration for Senpai. NOTE(review): presumably this makes the
// plugin constructible under the name `senpai` through Senpai::create; the
// macro itself is defined outside this file (oomd/PluginRegistry.h) - confirm.
REGISTER_PLUGIN(senpai, Senpai::create);
// Declare every argument the plugin understands, parse the supplied arguments
// and remember the host's MemTotal as reported by Fs::getMeminfo().
//
// Returns 0 on success and 1 when the arguments fail to parse or meminfo
// cannot be read.
int Senpai::init(
    const Engine::PluginArgs& args,
    const PluginConstructionContext& context) {
  // The cgroup argument goes through PluginArgParser::parseCgroup, which needs
  // the construction context, hence the custom parser.
  argParser_.addArgumentCustom(
      "cgroup",
      cgroups_,
      [context](const std::string& raw_cgroup) {
        return PluginArgParser::parseCgroup(context, raw_cgroup);
      },
      true);

  // Limits and pacing
  argParser_.addArgument("limit_min_bytes", limit_min_bytes_);
  argParser_.addArgument("limit_max_bytes", limit_max_bytes_);
  argParser_.addArgument("interval", interval_);

  // Pressure targets
  argParser_.addArgument("pressure_ms", pressure_ms_);
  argParser_.addArgument("pressure_pct", mem_pressure_pct_);
  argParser_.addArgument("io_pressure_pct", io_pressure_pct_);

  // Probe / backoff tuning
  argParser_.addArgument("max_probe", max_probe_);
  argParser_.addArgument("max_backoff", max_backoff_);
  argParser_.addArgument("coeff_probe", coeff_probe_);
  argParser_.addArgument("coeff_backoff", coeff_backoff_);
  argParser_.addArgument("immediate_backoff", immediate_backoff_);
  argParser_.addArgument("memory_high_timeout_ms", memory_high_timeout_);

  // Swap handling
  argParser_.addArgument("swap_threshold", swap_threshold_);
  argParser_.addArgument("swapout_bps_threshold", swapout_bps_threshold_);
  argParser_.addArgument("swap_validation", swap_validation_);
  argParser_.addArgument("modulate_swappiness", modulate_swappiness_);

  // Logging
  argParser_.addArgument("log_interval", log_interval_);

  if (!argParser_.parse(args)) {
    return 1;
  }

  auto meminfo = Fs::getMeminfo();
  // TODO(dschatzberg): Report Error
  if (!meminfo) {
    OLOG << "Cannot read host MemTotal";
    return 1;
  }

  // A readable meminfo without a MemTotal entry leaves host_mem_total_ as is.
  auto mem_total_pos = meminfo->find("MemTotal");
  if (mem_total_pos != meminfo->end()) {
    host_mem_total_ = mem_total_pos->second;
  }
  return 0;
}
// Reconcile the tracked cgroups with the cgroups currently resolved from the
// configured patterns, and tick every cgroup that appears in both.
//
// Both sequences are walked in increasing id order, merge-style:
//   - resolved but not tracked: initialize its state and start tracking it
//     (it is first ticked on a later run)
//   - tracked but not resolved: drop its state
//   - in both: tick it, and drop its state if the tick reports the cgroup is
//     no longer valid
// Once every log_interval_ runs, the per-cgroup probe statistics are logged
// and reset.
//
// Always returns Engine::PluginRet::CONTINUE.
Engine::PluginRet Senpai::run(OomdContext& ctx) {
  auto resolved_cgroups = ctx.reverseSort(
      cgroups_,
      [](const CgroupContext& cgroup_ctx) { return cgroup_ctx.id(); });
  // Use reverse iterator after reverseSort to make it normal order
  auto resolvedIt = resolved_cgroups.crbegin();
  auto trackedIt = tracked_cgroups_.begin();

  bool do_aggregate_log = false;
  if (++log_ticks_ >= log_interval_) {
    log_ticks_ = 0;
    do_aggregate_log = true;
  }

  // Iterate both tracked cgroups and resolved cgroups in increasing id order
  while (resolvedIt != resolved_cgroups.crend()) {
    const CgroupContext& cgroup_ctx = *resolvedIt;
    // Use id to identify CgroupContext across intervals, as path, dir_fd, and
    // memory address could all be recycled upon cgroup recreation.
    auto id_opt = cgroup_ctx.id();
    if (!id_opt) {
      // No id available for this cgroup: skip it. The iterator has to be
      // advanced here, otherwise the loop would never make progress.
      ++resolvedIt;
      continue;
    }

    if (trackedIt == tracked_cgroups_.end() || *id_opt < trackedIt->first) {
      // Resolved cgroup not in tracked map, track it
      // New cgroups will be polled after a "tick" has elapsed
      if (auto new_cgroup_state_opt = initializeCgroup(cgroup_ctx)) {
        tracked_cgroups_.emplace_hint(
            trackedIt, *id_opt, *new_cgroup_state_opt);
      }
      ++resolvedIt;
    } else if (*id_opt > trackedIt->first) {
      // Tracked cgroup is no longer among the resolved ones, forget it
      trackedIt = tracked_cgroups_.erase(trackedIt);
    } else {
      bool tick_result = immediate_backoff_
          ? tick_immediate_backoff(cgroup_ctx, trackedIt->second)
          : tick(cgroup_ctx, trackedIt->second);
      if (do_aggregate_log && tick_result) {
        auto& state = trackedIt->second;
        std::ostringstream oss;
        oss << "cgroup " << cgroup_ctx.cgroup().relativePath() << " "
            << state.probe_count << " probe attempts (" << std::setprecision(3)
            << std::fixed << state.probe_bytes / static_cast<double>(1UL << 30)
            << " gb)";
        OLOG << oss.str();
        // Reset stats
        state.probe_count = 0;
        state.probe_bytes = 0;
      }
      // Keep the tracked cgroups if they are still valid after tick
      trackedIt = tick_result ? std::next(trackedIt)
                              : tracked_cgroups_.erase(trackedIt);
      ++resolvedIt;
    }
  }

  // Whatever is left in the tracked map was not resolved this time around
  tracked_cgroups_.erase(trackedIt, tracked_cgroups_.end());
  return Engine::PluginRet::CONTINUE;
}
// Build the per-cgroup state from the initial limit, the pressure total
// observed when tracking starts, and the initial tick countdown.
Senpai::CgroupState::CgroupState(
    int64_t start_limit,
    std::chrono::microseconds total,
    int64_t start_ticks)
    : limit(start_limit), last_total(total), ticks(start_ticks) {}
namespace {
// Read the "some" memory pressure of a cgroup and return its total stall
// time. Returns nullopt when the pressure cannot be read (cgroup is invalid).
// Throws std::runtime_error when pressure is readable but carries no total.
std::optional<std::chrono::microseconds> getPressureTotalSome(
    const CgroupContext& cgroup_ctx) {
  // Senpai reads pressure.some to get early notice that a workload
  // may be under resource pressure
  const auto pressure = Oomd::Fs::readMempressureAt(
      cgroup_ctx.fd(), Oomd::Fs::PressureType::SOME);
  if (!pressure) {
    return std::nullopt;
  }

  const auto total = pressure.value().total;
  if (!total) {
    throw std::runtime_error("Senpai enabled but no total pressure info");
  }
  return total.value();
}
} // namespace
// Tell whether the system supports the memory.reclaim cgroup control file.
// Support is probed on the given cgroup and assumed to hold system-wide; once
// an answer is known it is cached and later calls do not touch the
// filesystem. While no answer is cached, nullopt is returned if the cgroup
// does not have the memory controller enabled or is no longer valid.
std::optional<bool> Senpai::hasMemoryReclaim(const CgroupContext& cgroup_ctx) {
  if (has_memory_reclaim_.has_value()) {
    return has_memory_reclaim_;
  }

  auto controllers_maybe = Fs::readControllersAt(cgroup_ctx.fd());
  if (controllers_maybe) {
    for (const auto& controller : *controllers_maybe) {
      if (controller != "memory") {
        continue;
      }
      // Memory controller is enabled, so the presence of the file is decisive
      has_memory_reclaim_ = static_cast<bool>(
          Fs::checkExistAt(cgroup_ctx.fd(), Fs::kMemReclaimFile));
      break;
    }
  }
  return has_memory_reclaim_;
}
// Tell whether the system supports the memory.high.tmp cgroup control file.
// Support is probed on the given cgroup and assumed to hold system-wide; once
// an answer is known it is cached and later calls do not touch the
// filesystem. While no answer is cached, nullopt is returned if the cgroup is
// no longer valid.
std::optional<bool> Senpai::hasMemoryHighTmp(const CgroupContext& cgroup_ctx) {
  if (has_memory_high_tmp_.has_value()) {
    return has_memory_high_tmp_;
  }

  if (cgroup_ctx.memory_high_tmp()) {
    has_memory_high_tmp_ = true;
  } else if (cgroup_ctx.memory_high()) {
    // memory.high is readable while memory.high.tmp is not: unsupported
    has_memory_high_tmp_ = false;
  }
  // Neither readable means the cgroup is invalid; the cache stays empty.
  return has_memory_high_tmp_;
}
// Read from memory.high.tmp (preferred) or memory.high of a given cgroup.
// Return nullopt if cgroup is no longer valid.
std::optional<int64_t> Senpai::readMemhigh(const CgroupContext& cgroup_ctx) {
if (auto has_memory_high_tmp = hasMemoryHighTmp(cgroup_ctx)) {
return *has_memory_high_tmp ? cgroup_ctx.memory_high_tmp()
: cgroup_ctx.memory_high();
}
return std::nullopt;
}
// Write to memory.high.tmp (preferred) or memory.high of a given cgroup.
// Return if the cgroup is still valid.
bool Senpai::writeMemhigh(const CgroupContext& cgroup_ctx, int64_t value) {
if (auto has_memory_high_tmp = hasMemoryHighTmp(cgroup_ctx)) {
if (*has_memory_high_tmp) {
if (!Oomd::Fs::writeMemhightmpAt(
cgroup_ctx.fd(), value, std::chrono::seconds(20))) {
return false;
}
} else if (!Oomd::Fs::writeMemhighAt(cgroup_ctx.fd(), value)) {
return false;
}
return true;
}
return false;
}
/*
* Invoke functor on a newly spawned thread and wait up to timeout for it to
* return.
*
* fn:      callable invoked with no arguments; its result is handed back
*          through a std::promise of the functor's result type.
* timeout: duration given to std::future::wait_for for the initial wait.
*
* Returns the functor's result if it finishes within timeout. Otherwise a
* signal (SIGUSR1) is sent to the thread running functor to interrupt the
* running syscall every second until the functor returns; the thread is then
* joined and an ETIMEDOUT error is returned, whatever the functor eventually
* produced. Won't help if functor is uninterruptable or spinning.
*
* NOTE(review): an exception thrown by fn is not caught inside the worker
* thread, and the result promise is never fulfilled in that case - confirm
* that callers only pass functors that do not throw.
*/
template <class Functor, class Duration>
SystemMaybe<typename std::invoke_result<Functor>::type> timed_invoke(
Functor&& fn,
Duration timeout) {
// ensure signal handler is setup before waiting on functor execution
std::promise<void> barrier;
auto barrier_future = barrier.get_future();
// Carries the functor's return value from the worker back to this thread
std::promise<typename std::invoke_result<Functor>::type> result;
auto future = result.get_future();
// The worker takes ownership of both promises and of the functor
std::thread t(
[](auto&& barrier, auto&& result, Functor&& fn) {
// Empty signal handler to interrupt syscall in fn
std::signal(SIGUSR1, [](int) {});
barrier.set_value();
result.set_value(fn());
},
std::move(barrier),
std::move(result),
std::forward<Functor>(fn));
// Do not start the timeout clock until the handler is installed
barrier_future.wait();
if (future.wait_for(timeout) == std::future_status::timeout) {
// Send signal to interrupt every second until we hear back from thread
do {
if (auto rc = ::pthread_kill(t.native_handle(), SIGUSR1); rc != 0) {
// thread already gone
if (rc == ESRCH) {
break;
}
// Something very wrong...
OLOG << systemError(rc, "pthread_kill failed").error().what();
std::terminate();
}
} while (future.wait_for(std::chrono::seconds(1)) ==
std::future_status::timeout);
// The initial wait expired, so this is reported as a timeout even though
// the functor has finished by now
t.join();
return systemError(ETIMEDOUT, "Timed out waiting execution");
} else {
// Finished in time: hand the functor's result to the caller
t.join();
return future.get();
}
}
// Call writeMemhigh in a different thread and send signal to interrupt write
// after timeout. Workaround for a kernel "feature" that blocks such write
// indefinitely if reclaim target is too low.
bool Senpai::writeMemhighTimeout(
const CgroupContext& cgroup_ctx,
int64_t value,
std::chrono::milliseconds timeout) {
auto valid_maybe =
timed_invoke([&]() { return writeMemhigh(cgroup_ctx, value); }, timeout);
if (!valid_maybe) {
// Most likely write timed out. Assume cgroup still valid and verify later.
OLOG << "Failed to write memory limit for "
<< cgroup_ctx.cgroup().relativePath() << ": "
<< valid_maybe.error().what();
return true;
} else {
return valid_maybe.value();
}
}
// Reset memory.high.tmp (preferred) or memory.high of a given cgroup to max.
// Return if the cgroup is still valid.
bool Senpai::resetMemhigh(const CgroupContext& cgroup_ctx) {
if (auto has_memory_high_tmp = hasMemoryHighTmp(cgroup_ctx)) {
auto value = std::numeric_limits<int64_t>::max();
if (*has_memory_high_tmp) {
if (!Oomd::Fs::writeMemhightmpAt(
cgroup_ctx.fd(), value, std::chrono::seconds(0))) {
return false;
}
} else if (!Oomd::Fs::writeMemhighAt(cgroup_ctx.fd(), value)) {
return false;
}
return true;
}
return false;
}
// Reclaim some number of bytes from the given cgroup.
bool Senpai::reclaim(const CgroupContext& cgroup_ctx, int64_t size) {
auto has_memory_reclaim_opt = hasMemoryReclaim(cgroup_ctx);
if (has_memory_reclaim_opt && *has_memory_reclaim_opt) {
return (bool)Fs::writeMemReclaimAt(cgroup_ctx.fd(), size);
}
auto current_opt = cgroup_ctx.current_usage();
if (!current_opt) {
return false;
}
int64_t limit = *current_opt - size;
// Poking by setting memory limit and immediately resetting it, which
// prevents sudden allocation later from triggering thrashing
if (memory_high_timeout_.count() > 0) {
if (!writeMemhighTimeout(cgroup_ctx, limit, memory_high_timeout_)) {
return false;
}
} else {
if (!writeMemhigh(cgroup_ctx, limit)) {
return false;
}
}
if (!resetMemhigh(cgroup_ctx)) {
return false;
}
return true;
}
/**
 * Returns file cache + swappable anon.
 *
 * File cache is active_file + inactive_file from memory.stat. Swappable anon
 * is only counted when the system has swap and a non-zero swappiness, and is
 * capped by the cgroup's effective free swap.
 *
 * Yields ENOENT when memory.stat or the effective free swap is unavailable,
 * EINVAL when the anon entries are missing, and throws std::runtime_error
 * when the file entries are missing.
 */
SystemMaybe<int64_t> Senpai::getReclaimableBytes(
    const CgroupContext& cgroup_ctx) {
  const auto& stat_opt = cgroup_ctx.memory_stat();
  if (!stat_opt) {
    return SYSTEM_ERROR(ENOENT);
  }

  auto active_file_pos = stat_opt->find("active_file");
  auto inactive_file_pos = stat_opt->find("inactive_file");
  const bool has_file_stats = active_file_pos != stat_opt->end() &&
      inactive_file_pos != stat_opt->end();
  if (!has_file_stats) {
    throw std::runtime_error("Invalid memory.stat cgroup file");
  }
  auto file_cache = active_file_pos->second + inactive_file_pos->second;

  int64_t swappable = 0;
  const auto& system_ctx = cgroup_ctx.oomd_ctx().getSystemContext();
  const bool swap_enabled =
      system_ctx.swaptotal > 0 && system_ctx.swappiness > 0;
  if (swap_enabled) {
    auto swap_free_opt = cgroup_ctx.effective_swap_free();
    if (!swap_free_opt) {
      return SYSTEM_ERROR(ENOENT);
    }
    if (*swap_free_opt > 0) {
      auto active_anon_pos = stat_opt->find("active_anon");
      auto inactive_anon_pos = stat_opt->find("inactive_anon");
      const bool has_anon_stats = active_anon_pos != stat_opt->end() &&
          inactive_anon_pos != stat_opt->end();
      if (!has_anon_stats) {
        return SYSTEM_ERROR(EINVAL);
      }
      auto anon_size = active_anon_pos->second + inactive_anon_pos->second;
      // Anon memory can only be swapped out as far as free swap allows
      swappable = std::min(*swap_free_opt, anon_size);
    }
  }
  return file_cache + swappable;
}
/** Returns unreclaimable + limit_min_bytes. */
std::optional<int64_t> Senpai::getLimitMinBytes(
const CgroupContext& cgroup_ctx) {
auto memcurr_opt = cgroup_ctx.current_usage();
if (!memcurr_opt) {
return std::nullopt;
}
auto reclaimable_maybe = getReclaimableBytes(cgroup_ctx);
if (!reclaimable_maybe) {
return std::nullopt;
}
auto unreclaimable = *memcurr_opt - *reclaimable_maybe;
auto limit_min_bytes = limit_min_bytes_ + unreclaimable;
auto memmin_opt = cgroup_ctx.memory_min();
if (!memmin_opt) {
return std::nullopt;
}
// Make sure memory.high don't go below memory.min
limit_min_bytes = std::max(limit_min_bytes, *memmin_opt);
return limit_min_bytes;
}
/**
* Return the minimum of the following:
* /proc/meminfo[MemTotal]
* memory.current + limit_max_bytes (default: 10G)
* memory.high (only if memory.high.tmp exist)
* memory.max
*/
std::optional<int64_t> Senpai::getLimitMaxBytes(
const CgroupContext& cgroup_ctx) {
auto memcurr_opt = cgroup_ctx.current_usage();
if (!memcurr_opt) {
return std::nullopt;
}
auto limit_max_bytes =
std::min(host_mem_total_, limit_max_bytes_ + *memcurr_opt);
// Don't let memory.high.tmp go above memory.high as kernel ignores the
// latter when the former is set.
auto has_memory_high_tmp_opt = hasMemoryHighTmp(cgroup_ctx);
if (!has_memory_high_tmp_opt) {
return std::nullopt;
}
if (*has_memory_high_tmp_opt) {
auto memhigh_opt = cgroup_ctx.memory_high();
if (!memhigh_opt) {
return std::nullopt;
}
limit_max_bytes = std::min(limit_max_bytes, *memhigh_opt);
}
auto memmax_opt = cgroup_ctx.memory_max();
if (!memmax_opt) {
return std::nullopt;
}
limit_max_bytes = std::min(limit_max_bytes, *memmax_opt);
return limit_max_bytes;
}
// Update state of a cgroup. Return if the cgroup is still valid.
bool Senpai::tick(const CgroupContext& cgroup_ctx, CgroupState& state) {
auto name = cgroup_ctx.cgroup().absolutePath();
auto limit_opt = readMemhigh(cgroup_ctx);
if (!limit_opt) {
return false;
}
auto factor = 0.0;
if (*limit_opt != state.limit) {
// Something else changed limits on this cgroup or it was
// recreated in-between ticks - reset the state and return,
// unfortuantely, the rest of this logic is still racy after this
// point
std::ostringstream oss;
oss << "cgroup " << name << " memory.high " << *limit_opt
<< " does not match recorded state " << state.limit
<< ". Resetting cgroup";
OLOG << oss.str();
if (auto state_opt = initializeCgroup(cgroup_ctx)) {
state = *state_opt;
return true;
}
return false;
}
// Adjust cgroup limit by factor
auto adjust = [&](double factor) {
auto limit_min_bytes_opt = getLimitMinBytes(cgroup_ctx);
if (!limit_min_bytes_opt) {
return false;
}
auto limit_max_bytes_opt = getLimitMaxBytes(cgroup_ctx);
if (!limit_max_bytes_opt) {
return false;
}
state.limit += state.limit * factor;
state.limit = std::max(
*limit_min_bytes_opt, std::min(*limit_max_bytes_opt, state.limit));
// Memory high is always a multiple of 4K
state.limit &= ~0xFFF;
state.ticks = interval_;
state.cumulative = std::chrono::microseconds{0};
return writeMemhigh(cgroup_ctx, state.limit);
};
auto total_opt = getPressureTotalSome(cgroup_ctx);
if (!total_opt) {
return false;
}
auto total = *total_opt;
auto delta = total - state.last_total;
state.last_total = total;
state.cumulative += delta;
auto cumulative = state.cumulative.count();
if (state.cumulative >= pressure_ms_) {
// Excessive pressure, back off. The rate scales exponentially
// with pressure deviation. The coefficient defines how sensitive
// we are to fluctuations around the target pressure: when the
// coefficient is 10, the adjustment curve reaches the backoff
// limit when observed pressure is ten times the target pressure.
double error = state.cumulative / pressure_ms_;
factor = error / coeff_backoff_;
factor *= factor;
factor = std::min(factor * max_backoff_, max_backoff_);
if (!adjust(factor)) {
return false;
}
std::ostringstream oss;
oss << "cgroup " << name << std::setprecision(3) << std::fixed
<< " limitgb " << *limit_opt / (double)(1 << 30UL) << " totalus "
<< total.count() << " deltaus " << delta.count() << " cumus "
<< cumulative << " ticks " << state.ticks << std::defaultfloat
<< " adjust " << factor;
OLOG << oss.str();
} else if (state.ticks) {
--state.ticks;
} else {
// Pressure too low, tighten the limit. Like when backing off, the
// adjustment becomes exponentially more aggressive as observed
// pressure falls below the target pressure. The adjustment limit
// is reached when stall time falls through pressure/coeff_probe_.
auto one = std::chrono::microseconds{1};
double error = pressure_ms_ / std::max(state.cumulative, one);
factor = error / coeff_probe_;
factor *= factor;
factor = std::min(factor * max_probe_, max_probe_);
factor = -factor;
if (!adjust(factor)) {
return false;
}
if (*limit_opt > state.limit) {
state.probe_count++;
state.probe_bytes += *limit_opt - state.limit;
}
}
return true;
}
// Update state of a cgroup. Return if the cgroup is still valid.
bool Senpai::tick_immediate_backoff(
const CgroupContext& cgroup_ctx,
CgroupState& state) {
// Wait for interval to prevent making senpai too aggressive
// May wait longer if pressures are too high
if (state.ticks) {
state.ticks--;
return true;
}
auto validate_pressure_maybe = validatePressure(cgroup_ctx);
if (!validate_pressure_maybe) {
return false;
}
auto validate = *validate_pressure_maybe;
if (swap_validation_) {
auto validate_swap_maybe = validateSwap(cgroup_ctx);
if (!validate_swap_maybe) {
return false;
}
validate = validate && *validate_swap_maybe;
}
if (validate) {
auto limit_min_bytes_opt = getLimitMinBytes(cgroup_ctx);
if (!limit_min_bytes_opt) {
return false;
}
auto current_opt = cgroup_ctx.current_usage();
if (!current_opt) {
return false;
}
if (*current_opt > *limit_min_bytes_opt) {
int original_swappiness;
if (modulate_swappiness_) {
original_swappiness =
cgroup_ctx.oomd_ctx().getSystemContext().swappiness;
auto swappiness_factor_maybe = calculateSwappinessFactor(cgroup_ctx);
if (!swappiness_factor_maybe) {
return false;
}
Fs::setSwappiness(original_swappiness * (*swappiness_factor_maybe));
}
OOMD_SCOPE_EXIT {
if (modulate_swappiness_) {
Fs::setSwappiness(original_swappiness);
}
};
// Reclaim slowly towards limit_min_bytes
int64_t reclaim_size = (*current_opt - *limit_min_bytes_opt) * max_probe_;
// Reclaim in number of 4k pages
reclaim_size &= ~0xFFF;
if (!reclaim(cgroup_ctx, reclaim_size)) {
return false;
}
state.probe_count++;
state.probe_bytes += reclaim_size;
state.ticks = interval_;
}
}
return true;
}
// Create the initial CgroupState for a cgroup. Unless immediate backoff is
// enabled, the limit is first set to the cgroup's current usage and that
// value is recorded as the starting limit. Return nullopt if cgroup no longer
// valid.
std::optional<Senpai::CgroupState> Senpai::initializeCgroup(
    const CgroupContext& cgroup_ctx) {
  // Immediate backoff does not use limit as a state.
  int64_t start_limit = 0;
  if (!immediate_backoff_) {
    auto usage_opt = cgroup_ctx.current_usage();
    if (!usage_opt || !writeMemhigh(cgroup_ctx, *usage_opt)) {
      return std::nullopt;
    }
    start_limit = *usage_opt;
  }

  auto total_opt = getPressureTotalSome(cgroup_ctx);
  if (!total_opt) {
    return std::nullopt;
  }
  return CgroupState(start_limit, *total_opt, interval_);
}
// Validate that pressure is low enough to drive Senpai: the higher of the 10s
// and 60s "some" averages must be below the configured target, for memory as
// well as for I/O. Yields ENOENT when either pressure is unavailable.
SystemMaybe<bool> Senpai::validatePressure(
    const CgroupContext& cgroup_ctx) const {
  auto mem_pressure_opt = cgroup_ctx.mem_pressure_some();
  if (!mem_pressure_opt) {
    return SYSTEM_ERROR(ENOENT);
  }
  auto io_pressure_opt = cgroup_ctx.io_pressure_some();
  if (!io_pressure_opt) {
    return SYSTEM_ERROR(ENOENT);
  }

  // Only drive senpai if both short and long term pressure from memory and I/O
  // are lower than target
  const auto mem_peak =
      std::max(mem_pressure_opt->sec_10, mem_pressure_opt->sec_60);
  const auto io_peak =
      std::max(io_pressure_opt->sec_10, io_pressure_opt->sec_60);
  return mem_peak < mem_pressure_pct_ && io_peak < io_pressure_pct_;
}
// Validate that swap is sufficient to run Senpai.
//
// Yields true when there is nothing to validate (no swap on the system, zero
// swappiness, or an effective swap.max of zero) or when the cgroup's
// effective swap utilization is below swap_threshold_. Yields false once
// utilization has reached the threshold, and ENOENT when a required cgroup
// value is unavailable.
SystemMaybe<bool> Senpai::validateSwap(const CgroupContext& cgroup_ctx) const {
  const auto& system_ctx = cgroup_ctx.oomd_ctx().getSystemContext();
  // If there's no swap at all, then there's nothing to validate
  if (system_ctx.swaptotal == 0 || system_ctx.swappiness == 0) {
    return true;
  }

  // Similarly if effective swap.max is zero, nothing to validate
  auto effective_swap_max_opt = cgroup_ctx.effective_swap_max();
  if (!effective_swap_max_opt) {
    return SYSTEM_ERROR(ENOENT);
  }
  if (*effective_swap_max_opt == 0) {
    return true;
  }

  // We validate that the effective swap usage is below the defined
  // threshold. This is useful to prevent OOM killing due to swap
  // depletion.
  auto effective_swap_util_pct_opt = cgroup_ctx.effective_swap_util_pct();
  if (!effective_swap_util_pct_opt) {
    return SYSTEM_ERROR(ENOENT);
  }
  // Strictly below: calculateSwappinessFactor treats ">= swap_threshold_" as
  // the point where no further swapping is allowed.
  return *effective_swap_util_pct_opt < swap_threshold_;
}
// Calculate swappiness factor (between 0 and 1) for a cgroup to modulate swap
// behavior. The factor is the smaller of two linear ramps, one driven by the
// system swap-out rate and one by the cgroup's swap utilization; each ramp is
// 0 at its threshold and approaches 1 as the measured value approaches 0.
// Yields ENOENT when the cgroup's swap utilization is unavailable.
SystemMaybe<double> Senpai::calculateSwappinessFactor(
    const CgroupContext& cgroup_ctx) const {
  if (swap_threshold_ <= 0) {
    return 0;
  }

  // Use the worse of the 60s and 300s swap-out rates
  const auto& system_ctx = cgroup_ctx.oomd_ctx().getSystemContext();
  auto swapout_bps =
      std::max(system_ctx.swapout_bps_60, system_ctx.swapout_bps_300);
  if (swapout_bps >= swapout_bps_threshold_) {
    return 0;
  }
  // If system has swapout bps close to or above threshold, factor will be close
  // to or equal to 0. If instead rate is close to 0, factor approaches 1.
  auto rate_factor = 1.0 - swapout_bps / swapout_bps_threshold_;

  auto swap_util_pct_opt = cgroup_ctx.effective_swap_util_pct();
  if (!swap_util_pct_opt) {
    return SYSTEM_ERROR(ENOENT);
  }
  if (*swap_util_pct_opt >= swap_threshold_) {
    return 0;
  }
  // If cgroup has swap usage close to or above threshold, factor will be close
  // to or equal to 0. If instead usage is close to 0, factor approaches 1.
  auto size_factor = 1.0 - *swap_util_pct_opt / swap_threshold_;

  return std::min(rate_factor, size_factor);
}
} // namespace Oomd