
Commit d89abc4

paulmckrcu authored and gregkh committed
rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks
commit 1612160 upstream.

Holding a mutex across synchronize_rcu_tasks() and acquiring that same
mutex in code called from do_exit() after its call to
exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop()
results in deadlock. This is by design, because tasks that are far
enough into do_exit() are no longer present on the tasks list, making
it a bit difficult for RCU Tasks to find them, let alone wait on them
to do a voluntary context switch.

However, such deadlocks are becoming more frequent. In addition, lockdep
currently does not detect such deadlocks and they can be difficult to
reproduce. Furthermore, if a task voluntarily context switches during
that time (for example, if it blocks acquiring a mutex), then this task
is in an RCU Tasks quiescent state, and with some adjustments, RCU Tasks
could just as well take advantage of that fact.

This commit therefore eliminates these deadlocks by replacing the
SRCU-based wait for do_exit() completion with per-CPU lists of tasks
currently exiting. A given task will be on one of these per-CPU lists
for the same period of time that it would previously have spent in the
corresponding SRCU read-side critical section. These lists enable RCU
Tasks to find the tasks that have already been removed from the tasks
list, but that must nevertheless be waited upon. The RCU Tasks grace
period gathers any of these do_exit() tasks that it must wait on, and
adds them to the list of holdouts. Per-CPU locking and get_task_struct()
are used to synchronize addition to and removal from these lists.

Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/
Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
Reported-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Yang Jihong <yangjihong1@huawei.com>
Tested-by: Chen Zhongjin <chenzhongjin@huawei.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Cc: Tahera Fahimi <taherafahimi@linux.microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 4e86206 commit d89abc4
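To make the deadlock concrete, here is a minimal sketch; the mutex and
the exit-path hook are hypothetical, standing in for any code that runs
from do_exit() between exit_tasks_rcu_start() and exit_tasks_rcu_stop():

	static DEFINE_MUTEX(my_mutex);	/* hypothetical mutex */

	/* Updater: holds my_mutex across an RCU Tasks grace period. */
	static void my_updater(void)
	{
		mutex_lock(&my_mutex);
		synchronize_rcu_tasks();	/* waits on all tasks, including exiting ones */
		mutex_unlock(&my_mutex);
	}

	/* Hypothetical hook invoked from do_exit() after exit_tasks_rcu_start(). */
	static void my_exit_hook(void)
	{
		mutex_lock(&my_mutex);	/* blocks behind my_updater()... */
		/* ...while synchronize_rcu_tasks() waits on this task: deadlock. */
		mutex_unlock(&my_mutex);
	}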

1 file changed: +28 −16 lines changed


kernel/rcu/tasks.h

Lines changed: 28 additions & 16 deletions
@@ -150,8 +150,6 @@ static struct rcu_tasks rt_name = \
 }
 
 #ifdef CONFIG_TASKS_RCU
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
 
 /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
 static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
@@ -879,10 +877,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 //	number of voluntary context switches, and add that task to the
 //	holdout list.
 // rcu_tasks_postscan():
-//	Invoke synchronize_srcu() to ensure that all tasks that were
-//	in the process of exiting (and which thus might not know to
-//	synchronize with this RCU Tasks grace period) have completed
-//	exiting.
+//	Gather per-CPU lists of tasks in do_exit() to ensure that all
+//	tasks that were in the process of exiting (and which thus might
+//	not know to synchronize with this RCU Tasks grace period) have
+//	completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
+//	will take care of any tasks stuck in the non-preemptible region
+//	of do_exit() following its call to exit_tasks_rcu_stop().
 // check_all_holdout_tasks(), repeatedly until holdout list is empty:
 //	Scans the holdout list, attempting to identify a quiescent state
 //	for each task on the list. If there is a quiescent state, the
@@ -895,8 +895,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 // with interrupts disabled.
 //
 // For each exiting task, the exit_tasks_rcu_start() and
-// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
-// read-side critical sections waited for by rcu_tasks_postscan().
+// exit_tasks_rcu_finish() functions add and remove, respectively, the
+// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
+// wait on. This is necessary because rcu_tasks_postscan() must wait on
+// tasks that have already been removed from the global list of tasks.
 //
 // Pre-grace-period update-side code is ordered before the grace
 // period via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
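The exit_tasks_rcu_start() and exit_tasks_rcu_finish() bodies are not
part of this diff. The following is only an illustrative sketch of the
registration side that the comment above describes, reusing the
rtp_exit_list and rcu_tasks_exit_list names visible in this file and
assuming a t->rcu_tasks_exit_cpu field that records which per-CPU list
was used:

	/* Illustrative sketch only: register the exiting task on its CPU's list. */
	void exit_tasks_rcu_start(void)
	{
		unsigned long flags;
		struct rcu_tasks_percpu *rtpcp;
		struct task_struct *t = current;

		preempt_disable();
		t->rcu_tasks_exit_cpu = smp_processor_id();	/* assumed field */
		rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
		raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
		list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
		raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
		preempt_enable();
	}

	/* Illustrative sketch only: remove the task once it is past the window. */
	void exit_tasks_rcu_finish(void)
	{
		unsigned long flags;
		struct task_struct *t = current;
		struct rcu_tasks_percpu *rtpcp =
			per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);

		raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
		list_del_init(&t->rcu_tasks_exit_list);
		raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
	}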
@@ -960,9 +962,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
 	}
 }
 
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
 /* Processing between scanning taskslist and draining the holdout list. */
 static void rcu_tasks_postscan(struct list_head *hop)
 {
+	int cpu;
 	int rtsi = READ_ONCE(rcu_task_stall_info);
 
 	if (!IS_ENABLED(CONFIG_TINY_RCU)) {
@@ -976,9 +982,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
 	 * this, divide the fragile exit path part in two intersecting
 	 * read side critical sections:
 	 *
-	 * 1) An _SRCU_ read side starting before calling exit_notify(),
-	 *    which may remove the task from the tasklist, and ending after
-	 *    the final preempt_disable() call in do_exit().
+	 * 1) A task_struct list addition before calling exit_notify(),
+	 *    which may remove the task from the tasklist, with the
+	 *    removal after the final preempt_disable() call in do_exit().
 	 *
 	 * 2) An _RCU_ read side starting with the final preempt_disable()
 	 *    call in do_exit() and ending with the final call to schedule()
@@ -987,7 +993,17 @@ static void rcu_tasks_postscan(struct list_head *hop)
 	 * This handles the part 1). And postgp will handle part 2) with a
 	 * call to synchronize_rcu().
 	 */
-	synchronize_srcu(&tasks_rcu_exit_srcu);
+
+	for_each_possible_cpu(cpu) {
+		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+		struct task_struct *t;
+
+		raw_spin_lock_irq_rcu_node(rtpcp);
+		list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list)
+			if (list_empty(&t->rcu_tasks_holdout_list))
+				rcu_tasks_pertask(t, hop);
+		raw_spin_unlock_irq_rcu_node(rtpcp);
+	}
 
 	if (!IS_ENABLED(CONFIG_TINY_RCU))
 		del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
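Design note: the list_empty(&t->rcu_tasks_holdout_list) check in the loop
above keeps a task that the earlier tasklist scan already placed on the
holdout list from being added twice. And as the commit message notes,
per-CPU locking and get_task_struct() synchronize additions to and
removals from these lists, so a task placed on the holdout list remains
safe to reference even after it is removed from its per-CPU exit list.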
@@ -1055,17 +1071,13 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
 	 *
 	 * In addition, this synchronize_rcu() waits for exiting tasks
 	 * to complete their final preempt_disable() region of execution,
-	 * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
 	 * enforcing the whole region before tasklist removal until
 	 * the final schedule() with TASK_DEAD state to be an RCU TASKS
 	 * read side critical section.
 	 */
 	synchronize_rcu();
 }
 
-void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
-
 static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
 {
 #ifndef CONFIG_TINY_RCU
