
Commit e23754e

Merge: mm/memcg: Free percpu stats memory of dying memcg's
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/2580
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2176388
Upstream Status: RHEL-only

For systems with a large number of CPUs, the majority of the memory consumed by the mem_cgroup structure is actually the percpu stats memory. When a large number of memory cgroups are continuously created and destroyed (as on a container host), more and more mem_cgroup structures can remain in the dying state, holding up an ever increasing amount of percpu memory.

We can't free the memory of a dying mem_cgroup structure because of active references, mainly from pages in the page cache. The percpu stats memory allocated to that mem_cgroup is a different story, however. There are two sets of percpu stat counters in the mem_cgroup structure and the associated mem_cgroup_per_node structure:

- vmstats_percpu (struct mem_cgroup)
- lruvec_stats_percpu (struct mem_cgroup_per_node)

There is an upstream discussion about the best way to handle dying memory cgroups that hang around indefinitely, mostly due to shared memory; see https://lwn.net/Articles/932070/ for more information. It looks like a final solution may still need some more time.

This patch is a workaround that frees the percpu stats memory associated with a dying memory cgroup. It eliminates the percpu memory increase problem, but we will still see an increase in slab memory consumption associated with the dying memory cgroups. As a workaround, it is not likely to be accepted upstream, but a lot of RHEL customers are seeing this percpu memory increase problem.

A new percpu_stats_disabled variable is added to keep track of the state of the percpu stats memory. If the variable is set, percpu stats updates are disabled for that particular memcg and forwarded to a parent memcg instead.

Disabling, flushing and freeing the percpu stats memory is a multi-step process. The percpu_stats_disabled variable is first set to PERCPU_STATS_DISABLED when the memcg is being set to the offline state. At this point, the cgroup filesystem control files corresponding to the offline cgroup are being removed and will no longer be visible from user space. After a grace period, with the help of rcu_work, no task should be reading or updating the percpu stats any more. The percpu_stats_disabled variable is then atomically set to PERCPU_STATS_FLUSHING before the percpu stats are flushed out and the state is changed to PERCPU_STATS_FLUSHED. The percpu memory is then freed and the state is changed to PERCPU_STATS_FREED.

This greatly reduces the amount of memory held up by dying memory cgroups. For the compiled RHEL9 kernel, memcg_vmstats_percpu and lruvec_stats_percpu have sizes of 1080 and 672 bytes respectively, while the mem_cgroup and mem_cgroup_per_node structures have sizes of 2240 and 1096 bytes respectively. For a 2-socket, 96-thread system, that means each dying memory cgroup uses 232,704 bytes of percpu data and 3,338 bytes of memcg slab data. The percpu/slab ratio is about 69, and it can be even higher for larger systems with more CPUs. By freeing the percpu memory, dying memory cgroups will now consume much less memory than before.

This patch does introduce a bit of performance overhead to memcg stat updates, especially __mod_memcg_lruvec_state(). This RHEL-only patch will be reverted when the upstream fix is finalized and merged into RHEL9.
Signed-off-by: Waiman Long <longman@redhat.com>
Approved-by: Rafael Aquini <aquini@redhat.com>
Approved-by: Aristeu Rozanski <arozansk@redhat.com>
Approved-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Jan Stancek <jstancek@redhat.com>
2 parents: 1f327c5 + 6616b3d
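The lifecycle described in the commit message is essentially a small state machine. Below is a minimal, self-contained userspace C sketch of the disable, flush and free sequence; the model_* names, the plain C11 atomics and the single counter are illustrative stand-ins for the memcg fields, cmpxchg() and the rcu_work deferral used in the actual patch, not kernel code.

/* Illustrative model of the percpu-stats lifecycle; not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

enum pcpu_state {               /* mirrors enum percpu_stats_state in the patch */
        MODEL_ACTIVE = 0,
        MODEL_DISABLED,         /* set when the memcg goes offline */
        MODEL_FLUSHING,         /* transient: stats being folded into totals */
        MODEL_FLUSHED,          /* totals final; percpu memory still allocated */
        MODEL_FREED             /* percpu memory returned to the system */
};

struct model_memcg {
        _Atomic int pcpu_state;
        long pcpu_counter;      /* stands in for the percpu stat arrays */
        long flushed_total;     /* stands in for the rstat totals */
};

/* Offline step: publish DISABLED; a grace period would elapse before the work runs. */
static void model_offline(struct model_memcg *m)
{
        atomic_store(&m->pcpu_state, MODEL_DISABLED);
}

/* Deferred work: runs once; a second invocation fails the compare-exchange. */
static void model_flush_and_free(struct model_memcg *m)
{
        int expected = MODEL_DISABLED;

        if (!atomic_compare_exchange_strong(&m->pcpu_state, &expected,
                                            MODEL_FLUSHING))
                return;                          /* already flushed or freed */

        m->flushed_total += m->pcpu_counter;     /* models cgroup_rstat_flush */
        atomic_store(&m->pcpu_state, MODEL_FLUSHED);

        m->pcpu_counter = 0;                     /* models free_percpu */
        atomic_store(&m->pcpu_state, MODEL_FREED);
}

int main(void)
{
        struct model_memcg m = { .pcpu_counter = 42 };

        model_offline(&m);
        model_flush_and_free(&m);
        model_flush_and_free(&m);                /* no-op on the second call */
        printf("state=%d total=%ld\n", atomic_load(&m.pcpu_state), m.flushed_total);
        return 0;
}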

2 files changed: +125, -7 lines

include/linux/memcontrol.h

Lines changed: 11 additions & 0 deletions

@@ -134,6 +134,8 @@ struct mem_cgroup_per_node {
 	unsigned long usage_in_excess;	/* Set to the value by which */
 					/* the soft limit is exceeded*/
 	bool on_tree;
+	RH_KABI_EXTEND(unsigned short nid)
+
 	struct mem_cgroup *memcg;	/* Back pointer, we cannot */
 					/* use container_of */
 };
@@ -343,6 +345,12 @@ struct mem_cgroup {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct deferred_split deferred_split_queue;
 #endif
+	/*
+	 * Disable percpu stats when offline, flush and free them after one
+	 * grace period.
+	 */
+	RH_KABI_EXTEND(struct rcu_work percpu_stats_rwork)
+	RH_KABI_EXTEND(int percpu_stats_disabled)
 
 	struct mem_cgroup_per_node *nodeinfo[];
 };
@@ -1013,6 +1021,9 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	if (pn->memcg->percpu_stats_disabled)
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
mm/memcontrol.c

Lines changed: 114 additions & 7 deletions

@@ -214,6 +214,14 @@ enum res_type {
 	_TCP,
 };
 
+enum percpu_stats_state {
+	PERCPU_STATS_ACTIVE = 0,
+	PERCPU_STATS_DISABLED,
+	PERCPU_STATS_FLUSHING,
+	PERCPU_STATS_FLUSHED,
+	PERCPU_STATS_FREED
+};
+
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
@@ -737,6 +745,30 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 	return x;
 }
 
+/*
+ * Return the active percpu stats memcg and optionally mem_cgroup_per_node.
+ *
+ * When percpu_stats_disabled, the percpu stats update is transferred to
+ * its parent.
+ */
+static __always_inline struct mem_cgroup *
+percpu_stats_memcg(struct mem_cgroup *memcg, struct mem_cgroup_per_node **pn)
+{
+	if (likely(!memcg->percpu_stats_disabled))
+		return memcg;
+
+	do {
+		memcg = parent_mem_cgroup(memcg);
+	} while (memcg->percpu_stats_disabled);
+
+	if (pn) {
+		unsigned int nid = (*pn)->nid;
+
+		*pn = memcg->nodeinfo[nid];
+	}
+	return memcg;
+}
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -748,6 +780,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 	memcg_rstat_updated(memcg, val);
 }
@@ -758,6 +791,9 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 	long x = 0;
 	int cpu;
 
+	if (unlikely(memcg->percpu_stats_disabled))
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
@@ -774,7 +810,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	struct mem_cgroup *memcg;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	memcg = pn->memcg;
+	memcg = percpu_stats_memcg(pn->memcg, &pn);
 
 	/*
 	 * The caller from rmap relay on disabled preemption becase they never
@@ -838,6 +874,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 
 	rcu_read_lock();
 	memcg = page_memcg(head);
+
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
@@ -889,6 +926,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled() || index < 0)
 		return;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
 	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
 	memcg_rstat_updated(memcg, count);
@@ -913,6 +951,9 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 	if (index < 0)
 		return 0;
 
+	if (unlikely(memcg->percpu_stats_disabled))
+		return 0;
+
 	for_each_possible_cpu(cpu)
 		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
 	return x;
@@ -921,6 +962,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 int nr_pages)
 {
+	memcg = percpu_stats_memcg(memcg, NULL);
+
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
@@ -937,6 +980,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
+	memcg = percpu_stats_memcg(memcg, NULL);
+
 	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 	/* from time_after() in jiffies.h */
@@ -5220,6 +5265,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
+	pn->nid = node;
 
 	memcg->nodeinfo[node] = pn;
 	return 0;
@@ -5232,7 +5278,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return;
 
-	free_percpu(pn->lruvec_stats_percpu);
+	//free_percpu(pn->lruvec_stats_percpu);
 	kfree(pn);
 }
 
@@ -5243,7 +5289,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
-	free_percpu(memcg->vmstats_percpu);
+	//free_percpu(memcg->vmstats_percpu);
 	kfree(memcg);
 }
 
@@ -5318,6 +5364,61 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return ERR_PTR(error);
 }
 
+/*
+ * Flush and free the percpu stats
+ */
+static void percpu_stats_free_rwork_fn(struct work_struct *work)
+{
+	struct mem_cgroup *memcg = container_of(to_rcu_work(work),
+						struct mem_cgroup,
+						percpu_stats_rwork);
+	int node;
+
+	if (cmpxchg(&memcg->percpu_stats_disabled, PERCPU_STATS_DISABLED,
+		    PERCPU_STATS_FLUSHING) != PERCPU_STATS_DISABLED) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+					      DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+		if (__ratelimit(&_rs))
+			WARN(1, "percpu_stats_free_rwork_fn() called more than once!\n");
+		return;
+	}
+
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FLUSHED);
+	cgroup_rstat_flush_release();
+
+	for_each_node(node) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+		if (pn)
+			free_percpu(pn->lruvec_stats_percpu);
+	}
+	free_percpu(memcg->vmstats_percpu);
+	WRITE_ONCE(memcg->percpu_stats_disabled, PERCPU_STATS_FREED);
+	css_put(&memcg->css);
+}
+
+static void memcg_percpu_stats_disable(struct mem_cgroup *memcg)
+{
+	/*
+	 * Block memcg from being freed before percpu_stats_free_rwork_fn()
+	 * is called. css_get() will succeed before a potential final
	 * css_put() in mem_cgroup_id_put().
+	 */
+	css_get(&memcg->css);
+	mem_cgroup_id_put(memcg);
+	memcg->percpu_stats_disabled = PERCPU_STATS_DISABLED;
+	INIT_RCU_WORK(&memcg->percpu_stats_rwork, percpu_stats_free_rwork_fn);
+	queue_rcu_work(system_wq, &memcg->percpu_stats_rwork);
+}
+
+static inline bool memcg_percpu_stats_flushed(struct mem_cgroup *memcg)
+{
+	return memcg->percpu_stats_disabled >= PERCPU_STATS_FLUSHED;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -5417,7 +5518,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	drain_all_stock(memcg);
 
-	mem_cgroup_id_put(memcg);
+	memcg_percpu_stats_disable(memcg);
 }
 
 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -5486,6 +5587,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	long delta, v;
 	int i, nid;
 
+	if (memcg_percpu_stats_flushed(memcg))
+		return;
+
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
 	for (i = 0; i < MEMCG_NR_STAT; i++) {
@@ -6981,6 +7085,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 static void uncharge_batch(const struct uncharge_gather *ug)
 {
 	unsigned long flags;
+	struct mem_cgroup *memcg;
 
 	if (ug->nr_memory) {
 		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
@@ -6991,10 +7096,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 		memcg_oom_recover(ug->memcg);
 	}
 
+	memcg = percpu_stats_memcg(ug->memcg, NULL);
+
 	local_irq_save(flags);
-	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-	memcg_check_events(ug->memcg, ug->nid);
+	__count_memcg_events(memcg, PGPGOUT, ug->pgpgout);
+	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+	memcg_check_events(memcg, ug->nid);
 	local_irq_restore(flags);
 
 	/* drop reference from uncharge_folio */
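As a reading aid for the update paths in the diff above, the effect of percpu_stats_memcg() can be summarized as "walk up to the nearest ancestor whose stats are still active and account there". The following is a minimal userspace C sketch of that idea; the model_memcg type, the stats_target() and mod_stat() helpers, and the assumption that the root memcg never has its stats disabled are illustrative and not taken from the patch.

/* Illustrative model of the parent-forwarding walk; not the kernel helper. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct model_memcg {
        struct model_memcg *parent;   /* NULL for the root */
        bool stats_disabled;          /* models percpu_stats_disabled != 0 */
        long stat;                    /* models one percpu counter */
};

/*
 * Walk toward the root until a memcg whose stats are still active is found.
 * The patch relies on the root memcg never having its stats disabled, so the
 * walk is assumed to terminate at a live ancestor.
 */
static struct model_memcg *stats_target(struct model_memcg *memcg)
{
        while (memcg->stats_disabled && memcg->parent)
                memcg = memcg->parent;
        return memcg;
}

static void mod_stat(struct model_memcg *memcg, long val)
{
        stats_target(memcg)->stat += val;     /* charge lands on a live ancestor */
}

int main(void)
{
        struct model_memcg root = { 0 };
        struct model_memcg parent = { .parent = &root };
        struct model_memcg dying = { .parent = &parent, .stats_disabled = true };

        mod_stat(&dying, 5);                  /* forwarded to "parent" */
        printf("parent=%ld root=%ld\n", parent.stat, root.stat);
        return 0;
}

In the patch itself the same re-targeting also applies to the per-node structure: the new nid field added to struct mem_cgroup_per_node lets percpu_stats_memcg() look up the matching nodeinfo[] entry of the live ancestor.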
