
Commit e01facf

shroffni authored and gregkh committed
block: fix potential deadlock while running nr_hw_queue update
[ Upstream commit 04225d1 ]

Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, helping to prevent potential deadlocks, as
observed in the reported lockdep splat [1].

This commit introduces batch allocation and deallocation helpers for
sched_tags, which are now used from within the
__blk_mq_update_nr_hw_queues routine while iterating through the
tagset. With this change, all sched_tags memory management is handled
entirely outside the ->elevator_lock and the ->freeze_lock context,
thereby eliminating the lock dependency that could otherwise manifest
during nr_hw_queues updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/

Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250730074614.2537382-4-nilay@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Stable-dep-of: 2d82f3b ("blk-mq: fix lockdep warning in __blk_mq_update_nr_hw_queues")
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 58567d8 commit e01facf

File tree: 5 files changed, +89 -15 lines changed
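Before the per-file diffs, a minimal userspace C analogue of the approach may help (purely illustrative: the names prealloc_all and apply_all are invented for this sketch, and a plain array stands in for the kernel xarray keyed by queue id). Phase one allocates every per-queue resource up front and unwinds on partial failure; phase two, which corresponds to the work done under ->freeze_lock and ->elevator_lock in the real code, performs no allocation at all.

/*
 * Illustrative userspace analogue, not kernel code. A fixed array
 * stands in for the xarray that maps queue id -> elevator_tags.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_QUEUES 4

struct tags { int dummy; };

static struct tags *table[NR_QUEUES];

/* Phase 1: allocate everything before taking any lock; on partial
 * failure, free the entries allocated so far in reverse order. */
static int prealloc_all(void)
{
	int i;

	for (i = 0; i < NR_QUEUES; i++) {
		table[i] = calloc(1, sizeof(*table[i]));
		if (!table[i])
			goto out_unwind;
	}
	return 0;
out_unwind:
	while (--i >= 0) {
		free(table[i]);
		table[i] = NULL;
	}
	return -1;
}

/* Phase 2: consume the preallocated entries; this phase cannot fail
 * on allocation, so it is safe to run under the locks. */
static void apply_all(void)
{
	int i;

	for (i = 0; i < NR_QUEUES; i++)
		printf("queue %d: using preallocated tags %p\n",
		       i, (void *)table[i]);
}

int main(void)
{
	int i;

	if (prealloc_all())
		return 1;
	apply_all();
	for (i = 0; i < NR_QUEUES; i++)
		free(table[i]);
	return 0;
}

The batch helpers in the first hunk below follow this shape, with xa_insert()/xa_load() in place of the array.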

block/blk-mq-sched.c

Lines changed: 65 additions & 0 deletions
@@ -427,6 +427,32 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
 	kfree(et);
 }
 
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (unlikely(!et))
+				WARN_ON_ONCE(1);
+			else
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+}
+
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues)
 {
@@ -477,6 +503,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 	return NULL;
 }
 
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+			if (!et)
+				goto out_unwind;
+			if (xa_insert(et_table, q->id, et, gfp))
+				goto out_free_tags;
+		}
+	}
+	return 0;
+out_free_tags:
+	blk_mq_free_sched_tags(et, set);
+out_unwind:
+	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (et)
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+	return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
 		struct elevator_tags *et)
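A note on the unwind in blk_mq_alloc_sched_tags_batch() above: there are two error labels because the two failure points leave different state behind. When xa_insert() fails, the just-allocated et is not yet in the xarray, so it is freed directly at out_free_tags before falling through to out_unwind, which walks the already-visited queues in reverse via list_for_each_entry_continue_reverse() and frees only the entries that did make it into the table.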

block/blk-mq-sched.h

Lines changed: 4 additions & 0 deletions
@@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
 
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
 void blk_mq_free_sched_tags(struct elevator_tags *et,
 		struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set);
 
 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {

block/blk-mq.c

Lines changed: 11 additions & 5 deletions
@@ -4972,12 +4972,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
  * Switch back to the elevator type stored in the xarray.
  */
 static void blk_mq_elv_switch_back(struct request_queue *q,
-		struct xarray *elv_tbl)
+		struct xarray *elv_tbl, struct xarray *et_tbl)
 {
 	struct elevator_type *e = xa_load(elv_tbl, q->id);
+	struct elevator_tags *t = xa_load(et_tbl, q->id);
 
 	/* The elv_update_nr_hw_queues unfreezes the queue. */
-	elv_update_nr_hw_queues(q, e);
+	elv_update_nr_hw_queues(q, e, t);
 
 	/* Drop the reference acquired in blk_mq_elv_switch_none. */
 	if (e)
@@ -5029,7 +5030,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	int prev_nr_hw_queues = set->nr_hw_queues;
 	unsigned int memflags;
 	int i;
-	struct xarray elv_tbl;
+	struct xarray elv_tbl, et_tbl;
 
 	lockdep_assert_held(&set->tag_list_lock);
 
@@ -5042,6 +5043,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 	memflags = memalloc_noio_save();
 
+	xa_init(&et_tbl);
+	if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
+		goto out_memalloc_restore;
+
 	xa_init(&elv_tbl);
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5085,7 +5090,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 switch_back:
 	/* The blk_mq_elv_switch_back unfreezes queue for us. */
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
-		blk_mq_elv_switch_back(q, &elv_tbl);
+		blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_sysfs_register_hctxs(q);
@@ -5096,7 +5101,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	}
 
 	xa_destroy(&elv_tbl);
-
+	xa_destroy(&et_tbl);
+out_memalloc_restore:
 	memalloc_noio_restore(memflags);
 
 	/* Free the excess tags when nr_hw_queues shrink. */
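Two things stand out in the reworked __blk_mq_update_nr_hw_queues() flow above: et_tbl is populated by blk_mq_alloc_sched_tags_batch() before any queue is frozen, so the allocation can no longer nest inside ->freeze_lock, and an allocation failure now bails out through out_memalloc_restore without touching any queue at all. The table is destroyed only after every queue has been switched back.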

block/blk.h

Lines changed: 3 additions & 1 deletion
@@ -12,6 +12,7 @@
 #include "blk-crypto-internal.h"
 
 struct elevator_type;
+struct elevator_tags;
 
 #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
 #define BLK_MIN_SEGMENT_SIZE 4096
@@ -322,7 +323,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *t);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);

block/elevator.c

Lines changed: 6 additions & 9 deletions
@@ -705,7 +705,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *t)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 	struct elv_change_ctx ctx = {};
@@ -715,25 +716,21 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
 
 	if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
 		ctx.name = e->elevator_name;
-		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
-		if (!ctx.et) {
-			WARN_ON_ONCE(1);
-			goto unfreeze;
-		}
+		ctx.et = t;
+
 		mutex_lock(&q->elevator_lock);
 		/* force to reattach elevator after nr_hw_queue is updated */
 		ret = elevator_switch(q, &ctx);
 		mutex_unlock(&q->elevator_lock);
 	}
-unfreeze:
 	blk_mq_unfreeze_queue_nomemrestore(q);
 	if (!ret)
 		WARN_ON_ONCE(elevator_change_done(q, &ctx));
 	/*
 	 * Free sched tags if it's allocated but we couldn't switch elevator.
 	 */
-	if (ctx.et && !ctx.new)
-		blk_mq_free_sched_tags(ctx.et, set);
+	if (t && !ctx.new)
+		blk_mq_free_sched_tags(t, set);
 }
 
 /*
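The elevator.c change completes the ownership transfer: elv_update_nr_hw_queues() no longer allocates sched_tags itself while the queue is frozen; it receives the preallocated tags as t, hands them to elevator_switch() through ctx.et, and frees them only if no new elevator adopted them (t && !ctx.new). With no allocation left that can fail in this path, the old unfreeze error label goes away.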
