
Commit e23754e

Merge: mm/memcg: Free percpu stats memory of dying memcg's
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/2580
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2176388
Upstream Status: RHEL-only

For systems with a large number of CPUs, the majority of the memory consumed by the mem_cgroup structure is actually the percpu stats memory. When a large number of memory cgroups are continuously created and destroyed (as on a container host), more and more mem_cgroup structures can remain in the dying state, holding up an ever increasing amount of percpu memory.

We can't free the memory of a dying mem_cgroup structure because of active references, mainly from pages in the page cache. The percpu stats memory allocated to that mem_cgroup is a different story, however. There are two sets of percpu stat counters in the mem_cgroup structure and the associated mem_cgroup_per_node structure:

- vmstats_percpu (struct mem_cgroup)
- lruvec_stats_percpu (struct mem_cgroup_per_node)

There is an upstream discussion about the best way to handle dying memory cgroups that hang around indefinitely, mostly due to shared memory; see https://lwn.net/Articles/932070/ for more information. It looks like a final solution may still need some more time.

This patch is a workaround that frees the percpu stats memory associated with a dying memory cgroup. It eliminates the percpu memory increase problem, but we will still see an increase in slab memory consumption associated with the dying memory cgroups. As a workaround, it is not likely to be accepted upstream, but a lot of RHEL customers are seeing this percpu memory increase problem.

A new percpu_stats_disabled variable is added to keep track of the state of the percpu stats memory. If the variable is set, percpu stats updates are disabled for that particular memcg and forwarded to a parent memcg instead.

Disabling, flushing and freeing the percpu stats memory is a multi-step process. The percpu_stats_disabled variable is first set to PERCPU_STATS_DISABLED when the memcg is being set to the offline state. At this point, the cgroup filesystem control files corresponding to the offline cgroup are being removed and will no longer be visible from user space. After a grace period, with the help of rcu_work, no task should be reading or updating the percpu stats any more. The percpu_stats_disabled variable is then atomically set to PERCPU_STATS_FLUSHING before the percpu stats are flushed out and the state is changed to PERCPU_STATS_FLUSHED. The percpu memory is then freed and the state is changed to PERCPU_STATS_FREED.

This greatly reduces the amount of memory held up by dying memory cgroups. For the compiled RHEL9 kernel, memcg_vmstats_percpu and lruvec_stats_percpu have sizes of 1080 and 672 bytes respectively, while the mem_cgroup and mem_cgroup_per_node structures have sizes of 2240 and 1096 bytes respectively. For a 2-socket, 96-thread system, that means each dying memory cgroup uses 232,704 bytes of percpu data and 3,338 bytes of memcg slab data. The percpu/slab ratio is about 69, and it can be even higher for larger systems with more CPUs. By freeing the percpu memory, dying memory cgroups will now consume much less memory than before.

This patch does introduce a bit of performance overhead to memcg stat updates, especially __mod_memcg_lruvec_state(). This RHEL-only patch will be reverted when the upstream fix is finalized and merged into RHEL9.
Signed-off-by: Waiman Long <longman@redhat.com>
Approved-by: Rafael Aquini <aquini@redhat.com>
Approved-by: Aristeu Rozanski <arozansk@redhat.com>
Approved-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Jan Stancek <jstancek@redhat.com>
2 parents: 1f327c5 + 6616b3d
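The lifecycle described in the commit message is essentially a small state machine. Below is a minimal, self-contained userspace C sketch of the disable, flush and free sequence; the model_* names, the plain C11 atomics and the single counter are illustrative stand-ins for the memcg fields, cmpxchg() and the rcu_work deferral used in the actual patch, not kernel code.

/* Illustrative model of the percpu-stats lifecycle; not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

enum pcpu_state {               /* mirrors enum percpu_stats_state in the patch */
        MODEL_ACTIVE = 0,
        MODEL_DISABLED,         /* set when the memcg goes offline */
        MODEL_FLUSHING,         /* transient: stats being folded into totals */
        MODEL_FLUSHED,          /* totals final; percpu memory still allocated */
        MODEL_FREED             /* percpu memory returned to the system */
};

struct model_memcg {
        _Atomic int pcpu_state;
        long pcpu_counter;      /* stands in for the percpu stat arrays */
        long flushed_total;     /* stands in for the rstat totals */
};

/* Offline step: publish DISABLED; a grace period would elapse before the work runs. */
static void model_offline(struct model_memcg *m)
{
        atomic_store(&m->pcpu_state, MODEL_DISABLED);
}

/* Deferred work: runs once; a second invocation fails the compare-exchange. */
static void model_flush_and_free(struct model_memcg *m)
{
        int expected = MODEL_DISABLED;

        if (!atomic_compare_exchange_strong(&m->pcpu_state, &expected,
                                            MODEL_FLUSHING))
                return;                          /* already flushed or freed */

        m->flushed_total += m->pcpu_counter;     /* models cgroup_rstat_flush */
        atomic_store(&m->pcpu_state, MODEL_FLUSHED);

        m->pcpu_counter = 0;                     /* models free_percpu */
        atomic_store(&m->pcpu_state, MODEL_FREED);
}

int main(void)
{
        struct model_memcg m = { .pcpu_counter = 42 };

        model_offline(&m);
        model_flush_and_free(&m);
        model_flush_and_free(&m);                /* no-op on the second call */
        printf("state=%d total=%ld\n", atomic_load(&m.pcpu_state), m.flushed_total);
        return 0;
}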

2 files changed: +125, -7 lines

include/linux/memcontrol.h

Lines changed: 11 additions & 0 deletions

@@ -134,6 +134,8 @@ struct mem_cgroup_per_node {
 	unsigned long usage_in_excess;	/* Set to the value by which */
 					/* the soft limit is exceeded*/
 	bool on_tree;
+	RH_KABI_EXTEND(unsigned short nid)
+
 	struct mem_cgroup *memcg;	/* Back pointer, we cannot */
 					/* use container_of */
 };
@@ -343,6 +345,12 @@ struct mem_cgroup {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct deferred_split deferred_split_queue;
 #endif
+	/*
+	 * Disable percpu stats when offline, flush and free them after one
+	 * grace period.
+	 */
+	RH_KABI_EXTEND(struct rcu_work percpu_stats_rwork)
+	RH_KABI_EXTEND(int percpu_stats_disabled)
 
 	struct mem_cgroup_per_node *nodeinfo[];
 };
@@ -1013,6 +1021,9 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	if (pn->memcg->percpu_stats_disabled)
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
mm/memcontrol.c

Lines changed: 114 additions & 7 deletions

@@ -214,6 +214,14 @@ enum res_type {
 	_TCP,
 };
 
+enum percpu_stats_state {
+	PERCPU_STATS_ACTIVE = 0,
+	PERCPU_STATS_DISABLED,
+	PERCPU_STATS_FLUSHING,
+	PERCPU_STATS_FLUSHED,
+	PERCPU_STATS_FREED
+};
+
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
@@ -737,6 +745,30 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 	return x;
 }
 
+/*
+ * Return the active percpu stats memcg and optionally mem_cgroup_per_node.
+ *
+ * When percpu_stats_disabled, the percpu stats update is transferred to
+ * its parent.
+ */
+static __always_inline struct mem_cgroup *
+percpu_stats_memcg(struct mem_cgroup *memcg, struct mem_cgroup_per_node **pn)
+{
+	if (likely(!memcg->percpu_stats_disabled))
+		return memcg;
+
+	do {
+		memcg = parent_mem_cgroup(memcg);
+	} while (memcg->percpu_stats_disabled);
+
+	if (pn) {
+		unsigned int nid = (*pn)->nid;
+
+		*pn = memcg->nodeinfo[nid];
+	}
+	return memcg;
+}
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -748,6 +780,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 	memcg_rstat_updated(memcg, val);
 }
@@ -758,6 +791,9 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 	long x = 0;
 	int cpu;
 
+	if (unlikely(memcg->percpu_stats_disabled))
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
@@ -774,7 +810,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	struct mem_cgroup *memcg;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	memcg = pn->memcg;
+	memcg = percpu_stats_memcg(pn->memcg, &pn);
 
 	/*
 	 * The caller from rmap relay on disabled preemption becase they never
@@ -838,6 +874,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 
 	rcu_read_lock();
 	memcg = page_memcg(head);
+
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
@@ -889,6 +926,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled() || index < 0)
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
 	memcg_rstat_updated(memcg, count);
@@ -913,6 +951,9 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 	if (index < 0)
 		return 0;
 
+	if (unlikely(memcg->percpu_stats_disabled))
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
 	return x;
@@ -921,6 +962,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 int nr_pages)
 {
+	memcg = percpu_stats_memcg(memcg, NULL);
+
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
@@ -937,6 +980,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
+
 	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 	/* from time_after() in jiffies.h */
@@ -5220,6 +5265,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
+	pn->nid = node;
 
 	memcg->nodeinfo[node] = pn;
 	return 0;
@@ -5232,7 +5278,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return;
 
-	free_percpu(pn->lruvec_stats_percpu);
+	//free_percpu(pn->lruvec_stats_percpu);
 	kfree(pn);
 }
 
@@ -5243,7 +5289,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
-	free_percpu(memcg->vmstats_percpu);
+	//free_percpu(memcg->vmstats_percpu);
 	kfree(memcg);
 }
 
@@ -5318,6 +5364,61 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return ERR_PTR(error);
 }
 
+/*
+ * Flush and free the percpu stats
+ */
+static void percpu_stats_free_rwork_fn(struct work_struct *work)
+{
+	struct mem_cgroup *memcg = container_of(to_rcu_work(work),
+						struct mem_cgroup,
+						percpu_stats_rwork);
+	int node;
+
+	if (cmpxchg(&memcg->percpu_stats_disabled, PERCPU_STATS_DISABLED,
+		    PERCPU_STATS_FLUSHING) != PERCPU_STATS_DISABLED) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+					      DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+		if (__ratelimit(&_rs))
+			WARN(1, "percpu_stats_free_rwork_fn() called more than once!\n");
+		return;
+	}
+
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FLUSHED);
+	cgroup_rstat_flush_release();
+
+	for_each_node(node) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+		if (pn)
+			free_percpu(pn->lruvec_stats_percpu);
+	}
+	free_percpu(memcg->vmstats_percpu);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FREED);
+	css_put(&memcg->css);
+}
+
+static void memcg_percpu_stats_disable(struct mem_cgroup *memcg)
+{
+	/*
+	 * Block memcg from being freed before percpu_stats_free_rwork_fn()
+	 * is called. css_get() will succeed before a potential final
	 * css_put() in mem_cgroup_id_put().
+	 */
+	css_get(&memcg->css);
+	mem_cgroup_id_put(memcg);
+	memcg->percpu_stats_disabled = PERCPU_STATS_DISABLED;
+	INIT_RCU_WORK(&memcg->percpu_stats_rwork, percpu_stats_free_rwork_fn);
+	queue_rcu_work(system_wq, &memcg->percpu_stats_rwork);
+}
+
+static inline bool memcg_percpu_stats_flushed(struct mem_cgroup *memcg)
+{
+	return memcg->percpu_stats_disabled >= PERCPU_STATS_FLUSHED;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -5417,7 +5518,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	drain_all_stock(memcg);
 
-	mem_cgroup_id_put(memcg);
+	memcg_percpu_stats_disable(memcg);
 }
 
 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -5486,6 +5587,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	long delta, v;
 	int i, nid;
 
+	if (memcg_percpu_stats_flushed(memcg))
+		return;
+
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
 	for (i = 0; i < MEMCG_NR_STAT; i++) {
@@ -6981,6 +7085,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 static void uncharge_batch(const struct uncharge_gather *ug)
 {
 	unsigned long flags;
+	struct mem_cgroup *memcg;
 
 	if (ug->nr_memory) {
 		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
@@ -6991,10 +7096,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 		memcg_oom_recover(ug->memcg);
 	}
 
+	memcg = percpu_stats_memcg(ug->memcg, NULL);
+
 	local_irq_save(flags);
-	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-	memcg_check_events(ug->memcg, ug->nid);
+	__count_memcg_events(memcg, PGPGOUT, ug->pgpgout);
+	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+	memcg_check_events(memcg, ug->nid);
 	local_irq_restore(flags);
 
 	/* drop reference from uncharge_folio */
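As a reading aid for the update paths in the diff above, the effect of percpu_stats_memcg() can be summarized as "walk up to the nearest ancestor whose stats are still active and account there". The following is a minimal userspace C sketch of that idea; the model_memcg type, the stats_target() and mod_stat() helpers, and the assumption that the root memcg never has its stats disabled are illustrative and not taken from the patch.

/* Illustrative model of the parent-forwarding walk; not the kernel helper. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct model_memcg {
        struct model_memcg *parent;   /* NULL for the root */
        bool stats_disabled;          /* models percpu_stats_disabled != 0 */
        long stat;                    /* models one percpu counter */
};

/*
 * Walk toward the root until a memcg whose stats are still active is found.
 * The patch relies on the root memcg never having its stats disabled, so the
 * walk is assumed to terminate at a live ancestor.
 */
static struct model_memcg *stats_target(struct model_memcg *memcg)
{
        while (memcg->stats_disabled && memcg->parent)
                memcg = memcg->parent;
        return memcg;
}

static void mod_stat(struct model_memcg *memcg, long val)
{
        stats_target(memcg)->stat += val;     /* charge lands on a live ancestor */
}

int main(void)
{
        struct model_memcg root = { 0 };
        struct model_memcg parent = { .parent = &root };
        struct model_memcg dying = { .parent = &parent, .stats_disabled = true };

        mod_stat(&dying, 5);                  /* forwarded to "parent" */
        printf("parent=%ld root=%ld\n", parent.stat, root.stat);
        return 0;
}

In the patch itself the same re-targeting also applies to the per-node structure: the new nid field added to struct mem_cgroup_per_node lets percpu_stats_memcg() look up the matching nodeinfo[] entry of the live ancestor.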
