
Commit 54efd50

koverstreet authored and axboe committed
block: make generic_make_request handle arbitrarily sized bios
The way the block layer is currently written, it goes to great lengths to avoid having to split bios; upper layer code (such as bio_add_page()) checks what the underlying device can handle and tries to always create bios that don't need to be split.

But this approach becomes unwieldy and eventually breaks down with stacked devices and devices with dynamic limits, and it adds a lot of complexity. If the block layer could split bios as needed, we could eliminate a lot of complexity elsewhere - particularly in stacked drivers. Code that creates bios can then create whatever size bios are convenient, and more importantly stacked drivers don't have to deal with both their own bio size limitations and the limitations of the (potentially multiple) devices underneath them. In the future this will let us delete merge_bvec_fn and a bunch of other code.

We do this by adding calls to blk_queue_split() to the various make_request functions that need it - a few can already handle arbitrary size bios. Note that we add the call _after_ any call to blk_queue_bounce(); this means that blk_queue_split() and blk_recalc_rq_segments() don't need to be concerned with bouncing affecting segment merging.

Some make_request_fn() callbacks were simple enough to audit and verify they don't need blk_queue_split() calls. The skipped ones are:

 * nfhd_make_request (arch/m68k/emu/nfblock.c)
 * axon_ram_make_request (arch/powerpc/sysdev/axonram.c)
 * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c)
 * brd_make_request (ramdisk - drivers/block/brd.c)
 * mtip_submit_request (drivers/block/mtip32xx/mtip32xx.c)
 * loop_make_request
 * null_queue_bio
 * bcache's make_request fns

Some others are almost certainly safe to remove now, but will be left for future patches.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
Cc: Lars Ellenberg <drbd-dev@lists.linbit.com>
Cc: drbd-user@lists.linbit.com
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Jim Paris <jim@jtan.com>
Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Acked-by: NeilBrown <neilb@suse.de> (for the 'md/md.c' bits)
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
[dpark: skip more mq-based drivers, resolve merge conflicts, etc.]
Signed-off-by: Dongsu Park <dpark@posteo.net>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
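For reference, the pattern the driver hunks below follow is simply to call blk_queue_split() at the top of the make_request function, after any blk_queue_bounce() call. A minimal sketch of that pattern (example_make_request() is a hypothetical driver entry point, not part of this commit):

    static void example_make_request(struct request_queue *q, struct bio *bio)
    {
        /* bounce first, so splitting sees the pages that will actually be used */
        blk_queue_bounce(q, &bio);

        /*
         * Split off as much as the queue limits allow; the remainder is
         * resubmitted via generic_make_request() and 'bio' now points at
         * the front piece that fits within the limits.
         */
        blk_queue_split(q, &bio, q->bio_split);

        /* ... driver-specific handling of 'bio' continues as before ... */
    }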
1 parent 4160989 commit 54efd50

File tree

16 files changed: +192 -22 lines


block/blk-core.c

Lines changed: 9 additions & 10 deletions
@@ -643,6 +643,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
+	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!q->bio_split)
+		goto fail_id;
+
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -651,7 +655,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	err = bdi_init(&q->backing_dev_info);
 	if (err)
-		goto fail_id;
+		goto fail_split;
 
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
@@ -693,6 +697,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 fail_bdi:
 	bdi_destroy(&q->backing_dev_info);
+fail_split:
+	bioset_free(q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
@@ -1610,6 +1616,8 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	struct request *req;
 	unsigned int request_count = 0;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
@@ -1832,15 +1840,6 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
-	if (likely(bio_is_rw(bio) &&
-		   nr_sectors > queue_max_hw_sectors(q))) {
-		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-		       bdevname(bio->bi_bdev, b),
-		       bio_sectors(bio),
-		       queue_max_hw_sectors(q));
-		goto end_io;
-	}
-
 	part = bio->bi_bdev->bd_part;
 	if (should_fail_request(part, bio->bi_iter.bi_size) ||
 	    should_fail_request(&part_to_disk(part)->part0,

block/blk-merge.c

Lines changed: 149 additions & 10 deletions
@@ -9,12 +9,158 @@
 
 #include "blk.h"
 
+static struct bio *blk_bio_discard_split(struct request_queue *q,
+					 struct bio *bio,
+					 struct bio_set *bs)
+{
+	unsigned int max_discard_sectors, granularity;
+	int alignment;
+	sector_t tmp;
+	unsigned split_sectors;
+
+	/* Zero-sector (unknown) and one-sector granularities are the same. */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+
+	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors -= max_discard_sectors % granularity;
+
+	if (unlikely(!max_discard_sectors)) {
+		/* XXX: warn */
+		return NULL;
+	}
+
+	if (bio_sectors(bio) <= max_discard_sectors)
+		return NULL;
+
+	split_sectors = max_discard_sectors;
+
+	/*
+	 * If the next starting sector would be misaligned, stop the discard at
+	 * the previous aligned sector.
+	 */
+	alignment = (q->limits.discard_alignment >> 9) % granularity;
+
+	tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+	tmp = sector_div(tmp, granularity);
+
+	if (split_sectors > tmp)
+		split_sectors -= tmp;
+
+	return bio_split(bio, split_sectors, GFP_NOIO, bs);
+}
+
+static struct bio *blk_bio_write_same_split(struct request_queue *q,
+					    struct bio *bio,
+					    struct bio_set *bs)
+{
+	if (!q->limits.max_write_same_sectors)
+		return NULL;
+
+	if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
+		return NULL;
+
+	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+}
+
+static struct bio *blk_bio_segment_split(struct request_queue *q,
+					 struct bio *bio,
+					 struct bio_set *bs)
+{
+	struct bio *split;
+	struct bio_vec bv, bvprv;
+	struct bvec_iter iter;
+	unsigned seg_size = 0, nsegs = 0;
+	int prev = 0;
+
+	struct bvec_merge_data bvm = {
+		.bi_bdev	= bio->bi_bdev,
+		.bi_sector	= bio->bi_iter.bi_sector,
+		.bi_size	= 0,
+		.bi_rw		= bio->bi_rw,
+	};
+
+	bio_for_each_segment(bv, bio, iter) {
+		if (q->merge_bvec_fn &&
+		    q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
+			goto split;
+
+		bvm.bi_size += bv.bv_len;
+
+		if (bvm.bi_size >> 9 > queue_max_sectors(q))
+			goto split;
+
+		/*
+		 * If the queue doesn't support SG gaps and adding this
+		 * offset would create a gap, disallow it.
+		 */
+		if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
+		    prev && bvec_gap_to_prev(&bvprv, bv.bv_offset))
+			goto split;
+
+		if (prev && blk_queue_cluster(q)) {
+			if (seg_size + bv.bv_len > queue_max_segment_size(q))
+				goto new_segment;
+			if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
+				goto new_segment;
+			if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+				goto new_segment;
+
+			seg_size += bv.bv_len;
+			bvprv = bv;
+			prev = 1;
+			continue;
+		}
+new_segment:
+		if (nsegs == queue_max_segments(q))
+			goto split;
+
+		nsegs++;
+		bvprv = bv;
+		prev = 1;
+		seg_size = bv.bv_len;
+	}
+
+	return NULL;
+split:
+	split = bio_clone_bioset(bio, GFP_NOIO, bs);
+
+	split->bi_iter.bi_size -= iter.bi_size;
+	bio->bi_iter = iter;
+
+	if (bio_integrity(bio)) {
+		bio_integrity_advance(bio, split->bi_iter.bi_size);
+		bio_integrity_trim(split, 0, bio_sectors(split));
+	}
+
+	return split;
+}
+
+void blk_queue_split(struct request_queue *q, struct bio **bio,
+		     struct bio_set *bs)
+{
+	struct bio *split;
+
+	if ((*bio)->bi_rw & REQ_DISCARD)
+		split = blk_bio_discard_split(q, *bio, bs);
+	else if ((*bio)->bi_rw & REQ_WRITE_SAME)
+		split = blk_bio_write_same_split(q, *bio, bs);
+	else
+		split = blk_bio_segment_split(q, *bio, q->bio_split);
+
+	if (split) {
+		bio_chain(split, *bio);
+		generic_make_request(*bio);
+		*bio = split;
+	}
+}
+EXPORT_SYMBOL(blk_queue_split);
+
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio,
 					     bool no_sg_merge)
 {
 	struct bio_vec bv, bvprv = { NULL };
-	int cluster, high, highprv = 1;
+	int cluster, prev = 0;
 	unsigned int seg_size, nr_phys_segs;
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
@@ -36,7 +182,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
-	high = 0;
 	for_each_bio(bio) {
 		bio_for_each_segment(bv, bio, iter) {
 			/*
@@ -46,13 +191,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 			if (no_sg_merge)
 				goto new_segment;
 
-			/*
-			 * the trick here is making sure that a high page is
-			 * never considered part of another segment, since
-			 * that might change with the bounce page.
-			 */
-			high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
-			if (!high && !highprv && cluster) {
+			if (prev && cluster) {
 				if (seg_size + bv.bv_len
 				    > queue_max_segment_size(q))
 					goto new_segment;
@@ -72,8 +211,8 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 
 			nr_phys_segs++;
 			bvprv = bv;
+			prev = 1;
 			seg_size = bv.bv_len;
-			highprv = high;
 		}
 		bbio = bio;
 	}

block/blk-mq.c

Lines changed: 4 additions & 0 deletions
@@ -1287,6 +1287,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return;
@@ -1372,6 +1374,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;

block/blk-sysfs.c

Lines changed: 3 additions & 0 deletions
@@ -561,6 +561,9 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_trace_shutdown(q);
 
+	if (q->bio_split)
+		bioset_free(q->bio_split);
+
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }

drivers/block/drbd/drbd_req.c

Lines changed: 2 additions & 0 deletions
@@ -1499,6 +1499,8 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	struct drbd_device *device = (struct drbd_device *) q->queuedata;
 	unsigned long start_jif;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	start_jif = jiffies;
 
 	/*

drivers/block/pktcdvd.c

Lines changed: 4 additions & 2 deletions
@@ -2447,6 +2447,10 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
 	char b[BDEVNAME_SIZE];
 	struct bio *split;
 
+	blk_queue_bounce(q, &bio);
+
+	blk_queue_split(q, &bio, q->bio_split);
+
 	pd = q->queuedata;
 	if (!pd) {
 		pr_err("%s incorrect request queue\n",
@@ -2477,8 +2481,6 @@
 		goto end_io;
 	}
 
-	blk_queue_bounce(q, &bio);
-
 	do {
 		sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
 		sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);

drivers/block/ps3vram.c

Lines changed: 2 additions & 0 deletions
@@ -606,6 +606,8 @@ static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
 	bio_list_add(&priv->list, bio);

drivers/block/rsxx/dev.c

Lines changed: 2 additions & 0 deletions
@@ -151,6 +151,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
 	struct rsxx_bio_meta *bio_meta;
 	int st = -EINVAL;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	might_sleep();
 
 	if (!card)

drivers/block/umem.c

Lines changed: 2 additions & 0 deletions
@@ -531,6 +531,8 @@ static void mm_make_request(struct request_queue *q, struct bio *bio)
 		 (unsigned long long)bio->bi_iter.bi_sector,
 		 bio->bi_iter.bi_size);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	spin_lock_irq(&card->lock);
 	*card->biotail = bio;
 	bio->bi_next = NULL;

drivers/block/zram/zram_drv.c

Lines changed: 2 additions & 0 deletions
@@ -900,6 +900,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 	if (unlikely(!zram_meta_get(zram)))
 		goto error;
 
+	blk_queue_split(queue, &bio, queue->bio_split);
+
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 			      bio->bi_iter.bi_size)) {
 		atomic64_inc(&zram->stats.invalid_io);
