Commit 6616b3d
mm/memcg: Free percpu stats memory of dying memcg's
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2176388
Upstream Status: RHEL-only

For systems with a large number of CPUs, the majority of the memory consumed
by the mem_cgroup structure is actually the percpu stats memory. When a large
number of memory cgroups are continuously created and destroyed (like on a
container host), more and more mem_cgroup structures can remain in the dying
state, holding up an increasing amount of percpu memory.

We can't free the memory of a dying mem_cgroup structure because of active
references, mainly from pages in the page cache. However, the percpu stats
memory allocated to that mem_cgroup is a different story. There are 2 sets of
percpu stat counters, in the mem_cgroup structure and in the associated
mem_cgroup_per_node structure:

 - vmstats_percpu (struct mem_cgroup)
 - lruvec_stats_percpu (struct mem_cgroup_per_node)

There is discussion upstream about the best way to handle dying memory cgroups
that hang around indefinitely, mostly due to shared memory. See
https://lwn.net/Articles/932070/ for more information. It looks like a final
solution may still need some more time.

This patch is a workaround that frees the percpu stats memory associated with
a dying memory cgroup. This eliminates the percpu memory increase problem, but
we will still see an increase in slab memory consumption associated with the
dying memory cgroups. As a workaround, it is not likely to be accepted
upstream, but a lot of RHEL customers are seeing this percpu memory increase
problem.

A new percpu_stats_disabled variable is added to keep track of the state of
the percpu stats memory. If the variable is set, percpu stats updates are
disabled for that particular memcg and forwarded to a parent memcg instead.

The disabling, flushing and freeing of the percpu stats memory is a multi-step
process. The percpu_stats_disabled variable is first set to
PERCPU_STATS_DISABLED when the memcg is taken offline. At that point, the
cgroup filesystem control files corresponding to the offline cgroup have been
removed and are no longer visible in user space. After a grace period, with
the help of rcu_work, no task should be reading or updating the percpu stats
anymore. The percpu_stats_disabled variable is then atomically set to
PERCPU_STATS_FLUSHING before the percpu stats are flushed out and its state is
changed to PERCPU_STATS_FLUSHED. The percpu memory is then freed and the state
is changed to PERCPU_STATS_FREED.

This greatly reduces the amount of memory held up by dying memory cgroups. For
the compiled RHEL9 kernel, memcg_vmstats_percpu and lruvec_stats_percpu have a
size of 1080 and 672 bytes respectively. The mem_cgroup and mem_cgroup_per_node
structures have a size of 2240 and 1096 bytes respectively. For a 2-socket
96-thread system, that means each dying memory cgroup uses 232,704 bytes of
percpu data and 3,338 bytes of memcg slab data. The percpu/slab ratio is 69.
The ratio can be even higher for larger systems with more CPUs. By freeing the
percpu memory, dying memory cgroups now consume much less memory than before.

This patch does introduce a bit of performance overhead when updating memcg
stats, especially in __mod_memcg_lruvec_state(). This RHEL-only patch will be
reverted when the upstream fix is finalized and merged into RHEL9.

Signed-off-by: Waiman Long <longman@redhat.com>
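For reference, the 232,704-byte figure quoted above follows directly from the
per-structure sizes, assuming 96 possible CPUs and 2 NUMA nodes (a sketch of
the arithmetic, not part of the patch):

    percpu data per CPU   = sizeof(memcg_vmstats_percpu) + nr_nodes * sizeof(lruvec_stats_percpu)
                          = 1080 + 2 * 672 = 2,424 bytes
    percpu data per memcg = 2,424 bytes * 96 CPUs = 232,704 bytes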
1 parent e02ca32 commit 6616b3d

2 files changed, 125 insertions(+), 7 deletions(-)
include/linux/memcontrol.h

Lines changed: 11 additions & 0 deletions
@@ -134,6 +134,8 @@ struct mem_cgroup_per_node {
 	unsigned long usage_in_excess;	/* Set to the value by which */
 					/* the soft limit is exceeded*/
 	bool on_tree;
+	RH_KABI_EXTEND(unsigned short nid)
+
 	struct mem_cgroup *memcg;	/* Back pointer, we cannot */
 					/* use container_of */
 };
@@ -343,6 +345,12 @@ struct mem_cgroup {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct deferred_split deferred_split_queue;
 #endif
+	/*
+	 * Disable percpu stats when offline, flush and free them after one
+	 * grace period.
+	 */
+	RH_KABI_EXTEND(struct rcu_work percpu_stats_rwork)
+	RH_KABI_EXTEND(int percpu_stats_disabled)
 
 	struct mem_cgroup_per_node *nodeinfo[];
 };
@@ -1013,6 +1021,9 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	if (pn->memcg->percpu_stats_disabled)
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP

mm/memcontrol.c

Lines changed: 114 additions & 7 deletions
@@ -214,6 +214,14 @@ enum res_type {
 	_TCP,
 };
 
+enum percpu_stats_state {
+	PERCPU_STATS_ACTIVE = 0,
+	PERCPU_STATS_DISABLED,
+	PERCPU_STATS_FLUSHING,
+	PERCPU_STATS_FLUSHED,
+	PERCPU_STATS_FREED
+};
+
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
@@ -737,6 +745,30 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 	return x;
 }
 
+/*
+ * Return the active percpu stats memcg and optionally mem_cgroup_per_node.
+ *
+ * When percpu_stats_disabled, the percpu stats update is transferred to
+ * its parent.
+ */
+static __always_inline struct mem_cgroup *
+percpu_stats_memcg(struct mem_cgroup *memcg, struct mem_cgroup_per_node **pn)
+{
+	if (likely(!memcg->percpu_stats_disabled))
+		return memcg;
+
+	do {
+		memcg = parent_mem_cgroup(memcg);
+	} while (memcg->percpu_stats_disabled);
+
+	if (pn) {
+		unsigned int nid = (*pn)->nid;
+
+		*pn = memcg->nodeinfo[nid];
+	}
+	return memcg;
+}
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -748,6 +780,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 	memcg_rstat_updated(memcg, val);
 }
@@ -758,6 +791,9 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 	long x = 0;
 	int cpu;
 
+	if (unlikely(memcg->percpu_stats_disabled))
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
@@ -774,7 +810,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	struct mem_cgroup *memcg;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	memcg = pn->memcg;
+	memcg = percpu_stats_memcg(pn->memcg, &pn);
 
 	/*
 	 * The caller from rmap relay on disabled preemption becase they never
@@ -838,6 +874,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 
 	rcu_read_lock();
 	memcg = page_memcg(head);
+
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
@@ -889,6 +926,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled() || index < 0)
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
 	memcg_rstat_updated(memcg, count);
@@ -913,6 +951,9 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 	if (index < 0)
 		return 0;
 
+	if (unlikely(memcg->percpu_stats_disabled))
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
 	return x;
@@ -921,6 +962,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 int nr_pages)
 {
+	memcg = percpu_stats_memcg(memcg, NULL);
+
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
@@ -937,6 +980,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
+
 	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 	/* from time_after() in jiffies.h */
@@ -5203,6 +5248,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
+	pn->nid = node;
 
 	memcg->nodeinfo[node] = pn;
 	return 0;
@@ -5215,7 +5261,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return;
 
-	free_percpu(pn->lruvec_stats_percpu);
+	//free_percpu(pn->lruvec_stats_percpu);
 	kfree(pn);
 }
 
@@ -5226,7 +5272,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
-	free_percpu(memcg->vmstats_percpu);
+	//free_percpu(memcg->vmstats_percpu);
 	kfree(memcg);
 }
 
@@ -5301,6 +5347,61 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return ERR_PTR(error);
 }
 
+/*
+ * Flush and free the percpu stats
+ */
+static void percpu_stats_free_rwork_fn(struct work_struct *work)
+{
+	struct mem_cgroup *memcg = container_of(to_rcu_work(work),
+						struct mem_cgroup,
+						percpu_stats_rwork);
+	int node;
+
+	if (cmpxchg(&memcg->percpu_stats_disabled, PERCPU_STATS_DISABLED,
+		    PERCPU_STATS_FLUSHING) != PERCPU_STATS_DISABLED) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+					      DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+		if (__ratelimit(&_rs))
+			WARN(1, "percpu_stats_free_rwork_fn() called more than once!\n");
+		return;
+	}
+
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FLUSHED);
+	cgroup_rstat_flush_release();
+
+	for_each_node(node) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+		if (pn)
+			free_percpu(pn->lruvec_stats_percpu);
+	}
+	free_percpu(memcg->vmstats_percpu);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FREED);
+	css_put(&memcg->css);
+}
+
+static void memcg_percpu_stats_disable(struct mem_cgroup *memcg)
+{
+	/*
+	 * Block memcg from being freed before percpu_stats_free_rwork_fn()
+	 * is called. css_get() will succeed before a potential final
+	 * css_put() in mem_cgroup_id_put().
+	 */
+	css_get(&memcg->css);
+	mem_cgroup_id_put(memcg);
+	memcg->percpu_stats_disabled = PERCPU_STATS_DISABLED;
+	INIT_RCU_WORK(&memcg->percpu_stats_rwork, percpu_stats_free_rwork_fn);
+	queue_rcu_work(system_wq, &memcg->percpu_stats_rwork);
+}
+
+static inline bool memcg_percpu_stats_flushed(struct mem_cgroup *memcg)
+{
+	return memcg->percpu_stats_disabled >= PERCPU_STATS_FLUSHED;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -5400,7 +5501,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	drain_all_stock(memcg);
 
-	mem_cgroup_id_put(memcg);
+	memcg_percpu_stats_disable(memcg);
 }
 
 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -5469,6 +5570,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	long delta, v;
 	int i, nid;
 
+	if (memcg_percpu_stats_flushed(memcg))
+		return;
+
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
 	for (i = 0; i < MEMCG_NR_STAT; i++) {
@@ -6964,6 +7068,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 static void uncharge_batch(const struct uncharge_gather *ug)
 {
 	unsigned long flags;
+	struct mem_cgroup *memcg;
 
 	if (ug->nr_memory) {
 		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
@@ -6974,10 +7079,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 		memcg_oom_recover(ug->memcg);
 	}
 
+	memcg = percpu_stats_memcg(ug->memcg, NULL);
+
 	local_irq_save(flags);
-	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-	memcg_check_events(ug->memcg, ug->nid);
+	__count_memcg_events(memcg, PGPGOUT, ug->pgpgout);
+	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+	memcg_check_events(memcg, ug->nid);
 	local_irq_restore(flags);
 
 	/* drop reference from uncharge_folio */
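Taken together, the hunks above implement the lifecycle described in the commit
message. A compressed sketch as a reading aid, not part of the patch (function
and state names are taken from the diff):

	/*
	 * mem_cgroup_css_offline(memcg)
	 *   memcg_percpu_stats_disable(memcg)
	 *     css_get(); mem_cgroup_id_put();
	 *     percpu_stats_disabled = PERCPU_STATS_DISABLED
	 *     queue_rcu_work()                    - wait one RCU grace period
	 *
	 * percpu_stats_free_rwork_fn(memcg)
	 *   cmpxchg(DISABLED -> FLUSHING)         - only one caller proceeds
	 *   cgroup_rstat_flush_hold()             - flush remaining counts
	 *   percpu_stats_disabled = PERCPU_STATS_FLUSHED
	 *   free_percpu() each pn->lruvec_stats_percpu and memcg->vmstats_percpu
	 *   percpu_stats_disabled = PERCPU_STATS_FREED
	 *   css_put()
	 */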
