Skip to content

Commit 993049c

Browse files
Chen Ridong authored and gregkh committed
cgroup: split cgroup_destroy_wq into 3 workqueues
[ Upstream commit 79f919a ]

A hung task can occur during [1] LTP cgroup testing when repeatedly
mounting/unmounting perf_event and net_prio controllers with
systemd.unified_cgroup_hierarchy=1. The hang manifests in
cgroup_lock_and_drain_offline() during root destruction.

Related case:
cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event
cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio

Call Trace:
    cgroup_lock_and_drain_offline+0x14c/0x1e8
    cgroup_destroy_root+0x3c/0x2c0
    css_free_rwork_fn+0x248/0x338
    process_one_work+0x16c/0x3b8
    worker_thread+0x22c/0x3b0
    kthread+0xec/0x100
    ret_from_fork+0x10/0x20

Root Cause:

CPU0                            CPU1
mount perf_event                umount net_prio
cgroup1_get_tree                cgroup_kill_sb
rebind_subsystems               // root destruction enqueues
                                // cgroup_destroy_wq
// kill all perf_event css
                                // one perf_event css A is dying
                                // css A offline enqueues cgroup_destroy_wq
                                // root destruction will be executed first
                                css_free_rwork_fn
                                cgroup_destroy_root
                                cgroup_lock_and_drain_offline
                                // some perf descendants are dying
                                // cgroup_destroy_wq max_active = 1
                                // waiting for css A to die

Problem scenario:
1. CPU0 mounts perf_event (rebind_subsystems)
2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work
3. A dying perf_event CSS gets queued for offline after root destruction
4. Root destruction waits for offline completion, but offline work is
   blocked behind root destruction in cgroup_destroy_wq (max_active=1)

Solution:
Split cgroup_destroy_wq into three dedicated workqueues:
cgroup_offline_wq – Handles CSS offline operations
cgroup_release_wq – Manages resource release
cgroup_free_wq – Performs final memory deallocation

This separation eliminates blocking in the CSS free path while waiting
for offline operations to complete.
[1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers

Fixes: 334c367 ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends")
Reported-by: Gao Yingjie <gaoyingjie@uniontech.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent ea6838b commit 993049c

File tree

1 file changed

+36
-7
lines changed

1 file changed

+36
-7
lines changed

kernel/cgroup/cgroup.c

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
123123
* of concurrent destructions. Use a separate workqueue so that cgroup
124124
* destruction work items don't end up filling up max_active of system_wq
125125
* which may lead to deadlock.
126+
*
127+
* A cgroup destruction should enqueue work sequentially to:
128+
* cgroup_offline_wq: use for css offline work
129+
* cgroup_release_wq: use for css release work
130+
* cgroup_free_wq: use for free work
131+
*
132+
* Rationale for using separate workqueues:
133+
* The cgroup root free work may depend on completion of other css offline
134+
* operations. If all tasks were enqueued to a single workqueue, this could
135+
* create a deadlock scenario where:
136+
* - Free work waits for other css offline work to complete.
137+
* - But other css offline work is queued after free work in the same queue.
138+
*
139+
* Example deadlock scenario with single workqueue (cgroup_destroy_wq):
140+
* 1. umount net_prio
141+
* 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
142+
* 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
143+
* 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
144+
* 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
145+
* which can never complete as it's behind in the same queue and
146+
* workqueue's max_active is 1.
126147
*/
127-
static struct workqueue_struct *cgroup_destroy_wq;
148+
static struct workqueue_struct *cgroup_offline_wq;
149+
static struct workqueue_struct *cgroup_release_wq;
150+
static struct workqueue_struct *cgroup_free_wq;
128151

129152
/* generate an array of cgroup subsystem pointers */
130153
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -5444,7 +5467,7 @@ static void css_release_work_fn(struct work_struct *work)
54445467
cgroup_unlock();
54455468

54465469
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5447-
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5470+
queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
54485471
}
54495472

54505473
static void css_release(struct percpu_ref *ref)
@@ -5453,7 +5476,7 @@ static void css_release(struct percpu_ref *ref)
54535476
container_of(ref, struct cgroup_subsys_state, refcnt);
54545477

54555478
INIT_WORK(&css->destroy_work, css_release_work_fn);
5456-
queue_work(cgroup_destroy_wq, &css->destroy_work);
5479+
queue_work(cgroup_release_wq, &css->destroy_work);
54575480
}
54585481

54595482
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5575,7 +5598,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
55755598
err_free_css:
55765599
list_del_rcu(&css->rstat_css_node);
55775600
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5578-
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5601+
queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
55795602
return ERR_PTR(err);
55805603
}
55815604

@@ -5811,7 +5834,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
58115834

58125835
if (atomic_dec_and_test(&css->online_cnt)) {
58135836
INIT_WORK(&css->destroy_work, css_killed_work_fn);
5814-
queue_work(cgroup_destroy_wq, &css->destroy_work);
5837+
queue_work(cgroup_offline_wq, &css->destroy_work);
58155838
}
58165839
}
58175840

@@ -6183,8 +6206,14 @@ static int __init cgroup_wq_init(void)
61836206
* We would prefer to do this in cgroup_init() above, but that
61846207
* is called before init_workqueues(): so leave this until after.
61856208
*/
6186-
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
6187-
BUG_ON(!cgroup_destroy_wq);
6209+
cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
6210+
BUG_ON(!cgroup_offline_wq);
6211+
6212+
cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
6213+
BUG_ON(!cgroup_release_wq);
6214+
6215+
cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
6216+
BUG_ON(!cgroup_free_wq);
61886217
return 0;
61896218
}
61906219
core_initcall(cgroup_wq_init);

0 commit comments

Comments
 (0)