Commit be64087
mm/memcg: Free percpu stats memory of dying memcg's
JIRA: https://issues.redhat.com/browse/RHEL-67445
Upstream Status: RHEL-only

For systems with a large number of CPUs, the majority of the memory
consumed by the mem_cgroup structure is actually the percpu stats
memory. When a large number of memory cgroups are continuously created
and destroyed (like in a container host), it is possible that more and
more mem_cgroup structures remain in the dying state, holding up an
increasing amount of percpu memory.

We can't free the memory of the dying mem_cgroup structures themselves
due to active references, mainly from pages in the page cache. The
percpu stats memory allocated to such a mem_cgroup, however, is a
different story.

As of the v6.12 kernel, there are 2 main sets of percpu stat counters
in the mem_cgroup structure and the associated mem_cgroup_per_node
structure:

 - vmstats_percpu (2424 bytes, in struct mem_cgroup)
 - lruvec_stats_percpu (1920 bytes, in struct mem_cgroup_per_node)

When using cgroup v1, there is also a small events_percpu stat counter
(24 bytes).

Upstream hasn't decided on the best way to handle dying memory cgroups
yet; see https://lwn.net/Articles/932070/ for more information. It
looks like a final solution may still need some more time. This patch
is a workaround that frees the percpu stats memory (except v1's small
events_percpu) associated with a dying memory cgroup. This mostly
eliminates the percpu memory increase problem, but we will still see
an increase in slab memory consumption associated with the dying
memory cgroups. As a workaround, it is not likely to be accepted
upstream, but a lot of RHEL customers are seeing this percpu memory
increase problem.

A new percpu_stats_disabled variable is added to keep track of the
state of the percpu stats memory. If the variable is set, percpu stats
updates are disabled for that particular memcg and forwarded to the
nearest ancestor memcg that is online. The only exception is
memcg_rstat_updated(), which is only called after the memcg has been
properly updated.

Disabling, flushing and freeing the percpu stats memory is a
multi-step process. The percpu_stats_disabled variable is first set to
PERCPU_STATS_DISABLED when the memcg is being taken offline. At that
point, the cgroup filesystem control files corresponding to the
offline cgroup are being removed and will no longer be visible in user
space. After an RCU grace period (via rcu_work), no task should still
be reading or updating the percpu stats. The percpu_stats_disabled
variable is then atomically set to PERCPU_STATS_FLUSHING before the
percpu stats are flushed out and the state changes to
PERCPU_STATS_FLUSHED. The percpu memory is then freed and the state
changes to PERCPU_STATS_FREED.

This greatly reduces the amount of memory held up by dying memory
cgroups. For a compiled RHEL10 x86-64 kernel running cgroup v2 on a
relatively simple 2-socket, 16-cores-per-socket system with HT on, the
memory consumption of the composite mem_cgroup structure is about
138,984 bytes, which is almost 136 kBytes. On a bigger 8-socket,
32-cores-per-socket system with HT on, the memory consumption is about
2,606,056 bytes, which is almost 2.5 MBytes. After getting rid of the
percpu stats memory, the memory consumption drops to about 5,864 and
17,384 bytes respectively, a saving of 95.8% and 99.3%. The saving is
especially large for systems with a large number of CPUs.

This patch does introduce a bit of performance overhead when updating
memcg stats, especially in __mod_memcg_lruvec_state().

This RHEL-only patch will be reverted once the upstream fix is
finalized and merged into RHEL10.

Signed-off-by: Waiman Long <longman@redhat.com>
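
As an illustration of the update forwarding described above, here is a
minimal user-space sketch of the ancestor walk. The toy_memcg,
stats_target() and mod_stat() names are invented for this sketch and
are not part of the patch; the real logic lives in percpu_stats_memcg()
in the diff below.

/*
 * Illustrative user-space sketch only; toy_memcg, stats_target() and
 * mod_stat() are invented stand-ins, not kernel code.
 */
#include <stdio.h>

struct toy_memcg {
	struct toy_memcg *parent;	/* NULL for the root memcg */
	int percpu_stats_disabled;	/* nonzero once the memcg is dying */
	long stat;			/* stand-in for a percpu counter */
};

/*
 * Walk up until a memcg with live percpu stats is found. The root
 * memcg is never taken offline, so the walk always terminates.
 */
static struct toy_memcg *stats_target(struct toy_memcg *memcg)
{
	while (memcg->percpu_stats_disabled)
		memcg = memcg->parent;
	return memcg;
}

/* An update against a dying memcg lands in its nearest live ancestor. */
static void mod_stat(struct toy_memcg *memcg, long val)
{
	stats_target(memcg)->stat += val;
}

int main(void)
{
	struct toy_memcg root = { 0 };
	struct toy_memcg leaf = { .parent = &root };

	mod_stat(&leaf, 1);		/* counted in leaf */
	leaf.percpu_stats_disabled = 1;	/* leaf goes offline */
	mod_stat(&leaf, 1);		/* forwarded to root */
	printf("leaf=%ld root=%ld\n", leaf.stat, root.stat);
	return 0;
}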
1 parent 89a6dfd

2 files changed, 105 insertions(+), 4 deletions(-)

include/linux/memcontrol.h (8 additions, 0 deletions)

@@ -23,6 +23,7 @@
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
 #include <linux/shrinker.h>
+#include <linux/rh_kabi.h>
 
 struct mem_cgroup;
 struct obj_cgroup;
@@ -104,6 +105,7 @@ struct mem_cgroup_per_node {
 	unsigned long		usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
 	bool			on_tree;
+	RH_KABI_FILL_HOLE(unsigned short nid)
 #else
 	CACHELINE_PADDING(_pad1_);
 #endif
@@ -322,6 +324,12 @@ struct mem_cgroup {
 	struct list_head event_list;
 	spinlock_t event_list_lock;
 #endif /* CONFIG_MEMCG_V1 */
+	/*
+	 * Disable percpu stats when offline, flush and free them after one
+	 * grace period.
+	 */
+	RH_KABI_EXTEND(int percpu_stats_disabled)
+	RH_KABI_EXTEND(struct rcu_work percpu_stats_rwork)
 
 	struct mem_cgroup_per_node *nodeinfo[];
 };

mm/memcontrol.c (97 additions, 4 deletions)

@@ -95,6 +95,14 @@ static bool cgroup_memory_nobpf __ro_after_init;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
 
+enum percpu_stats_state {
+	PERCPU_STATS_ACTIVE = 0,
+	PERCPU_STATS_DISABLED,
+	PERCPU_STATS_FLUSHING,
+	PERCPU_STATS_FLUSHED,
+	PERCPU_STATS_FREED
+};
+
 static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -666,6 +674,30 @@ static int memcg_state_val_in_pages(int idx, int val)
 	return max(val * unit / PAGE_SIZE, 1UL);
 }
 
+/*
+ * Return the active percpu stats memcg and optionally mem_cgroup_per_node.
+ *
+ * When percpu_stats_disabled, the percpu stats update is transferred to
+ * its parent.
+ */
+static __always_inline struct mem_cgroup *
+percpu_stats_memcg(struct mem_cgroup *memcg, struct mem_cgroup_per_node **pn)
+{
+	if (likely(!memcg->percpu_stats_disabled))
+		return memcg;
+
+	do {
+		memcg = parent_mem_cgroup(memcg);
+	} while (memcg->percpu_stats_disabled);
+
+	if (pn) {
+		unsigned int nid = (*pn)->nid;
+
+		*pn = memcg->nodeinfo[nid];
+	}
+	return memcg;
+}
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -683,6 +715,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
 	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
 }
@@ -716,7 +749,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 		return;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	memcg = pn->memcg;
+	memcg = percpu_stats_memcg(pn->memcg, &pn);
 
 	/*
 	 * The caller from rmap relies on disabled preemption because they never
@@ -831,6 +864,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[i], count);
 	memcg_rstat_updated(memcg, count);
@@ -3437,6 +3471,7 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
+	pn->nid = node;
 
 	memcg->nodeinfo[node] = pn;
 	return true;
@@ -3453,7 +3488,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return;
 
-	free_percpu(pn->lruvec_stats_percpu);
+	//free_percpu(pn->lruvec_stats_percpu);
 	kfree(pn->lruvec_stats);
 	kfree(pn);
 }
@@ -3468,7 +3503,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_node_info(memcg, node);
 	memcg1_free_events(memcg);
 	kfree(memcg->vmstats);
-	free_percpu(memcg->vmstats_percpu);
+	//free_percpu(memcg->vmstats_percpu);
 	kfree(memcg);
 }
 
@@ -3553,6 +3588,61 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 	return ERR_PTR(error);
 }
 
+/*
+ * Flush and free the percpu stats
+ */
+static void percpu_stats_free_rwork_fn(struct work_struct *work)
+{
+	struct mem_cgroup *memcg = container_of(to_rcu_work(work),
+						struct mem_cgroup,
+						percpu_stats_rwork);
+	int node;
+
+	if (cmpxchg(&memcg->percpu_stats_disabled, PERCPU_STATS_DISABLED,
+		    PERCPU_STATS_FLUSHING) != PERCPU_STATS_DISABLED) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+					      DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+		if (__ratelimit(&_rs))
+			WARN(1, "%s called more than once!\n", __func__);
+		return;
+	}
+
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FLUSHED);
+	cgroup_rstat_flush_release(memcg->css.cgroup);
+
+	for_each_node(node) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+		if (pn)
+			free_percpu(pn->lruvec_stats_percpu);
+	}
+	free_percpu(memcg->vmstats_percpu);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FREED);
+	css_put(&memcg->css);
+}
+
+static void memcg_percpu_stats_disable(struct mem_cgroup *memcg)
+{
+	/*
+	 * Block memcg from being freed before percpu_stats_free_rwork_fn()
+	 * is called. css_get() will succeed before a potential final
	 * css_put() in mem_cgroup_id_put().
+	 */
+	css_get(&memcg->css);
+	mem_cgroup_id_put(memcg);
+	memcg->percpu_stats_disabled = PERCPU_STATS_DISABLED;
+	INIT_RCU_WORK(&memcg->percpu_stats_rwork, percpu_stats_free_rwork_fn);
+	queue_rcu_work(system_wq, &memcg->percpu_stats_rwork);
+}
+
+static inline bool memcg_percpu_stats_flushed(struct mem_cgroup *memcg)
+{
+	return memcg->percpu_stats_disabled >= PERCPU_STATS_FLUSHED;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -3666,7 +3756,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	drain_all_stock(memcg);
 
-	mem_cgroup_id_put(memcg);
+	memcg_percpu_stats_disable(memcg);
 }
 
 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -3741,6 +3831,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	long delta, delta_cpu, v;
 	int i, nid;
 
+	if (memcg_percpu_stats_flushed(memcg))
+		return;
+
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
 	for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) {
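
For reference, the offline -> flush -> free handoff implemented by
percpu_stats_free_rwork_fn() above can be reduced to the following
user-space sketch, with C11 atomics standing in for the kernel's
cmpxchg() and WRITE_ONCE(); the flushing and freeing steps are stubbed
out as comments, and the function and variable names are stand-ins.

/*
 * Illustrative user-space sketch only; C11 atomics replace the
 * kernel's cmpxchg()/WRITE_ONCE() and the real work is stubbed out.
 */
#include <stdatomic.h>
#include <stdio.h>

enum percpu_stats_state {
	PERCPU_STATS_ACTIVE = 0,
	PERCPU_STATS_DISABLED,	/* set when the memcg is taken offline */
	PERCPU_STATS_FLUSHING,	/* rcu_work is flushing the stats */
	PERCPU_STATS_FLUSHED,	/* flushed; rstat flush now skips the memcg */
	PERCPU_STATS_FREED	/* percpu memory returned to the system */
};

static _Atomic int state = PERCPU_STATS_ACTIVE;

static void percpu_stats_free_work(void)
{
	int expected = PERCPU_STATS_DISABLED;

	/* Only the first caller may advance DISABLED -> FLUSHING. */
	if (!atomic_compare_exchange_strong(&state, &expected,
					    PERCPU_STATS_FLUSHING)) {
		fprintf(stderr, "percpu_stats_free_work called more than once!\n");
		return;
	}

	/* ... flush the percpu stats into the cumulative counters ... */
	atomic_store(&state, PERCPU_STATS_FLUSHED);

	/* ... free the percpu stats memory ... */
	atomic_store(&state, PERCPU_STATS_FREED);
}

int main(void)
{
	atomic_store(&state, PERCPU_STATS_DISABLED);	/* memcg goes offline */
	percpu_stats_free_work();	/* advances the state to FREED */
	percpu_stats_free_work();	/* second call is rejected */
	printf("final state = %d\n", atomic_load(&state));
	return 0;
}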
