Skip to content

Commit 1ccf164

Browse files
dhowells authored and axboe committed
block: Use iov_iter_extract_pages() and page pinning in direct-io.c
Change the old block-based direct-I/O code to use iov_iter_extract_pages()
to pin user pages or leave kernel pages unpinned rather than taking refs
when submitting bios.

This makes use of the preceding patches to not take pins on the zero page
(thereby allowing insertion of zero pages in with pinned pages) and to get
additional pins on pages, allowing an extracted page to be used in multiple
bios without having to re-extract it.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Christoph Hellwig <hch@infradead.org>
cc: David Hildenbrand <david@redhat.com>
cc: Lorenzo Stoakes <lstoakes@gmail.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Jens Axboe <axboe@kernel.dk>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jan Kara <jack@suse.cz>
cc: Jeff Layton <jlayton@kernel.org>
cc: Jason Gunthorpe <jgg@nvidia.com>
cc: Logan Gunthorpe <logang@deltatee.com>
cc: Hillf Danton <hdanton@sina.com>
cc: Christian Brauner <brauner@kernel.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: linux-fsdevel@vger.kernel.org
cc: linux-block@vger.kernel.org
cc: linux-kernel@vger.kernel.org
cc: linux-mm@kvack.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20230526214142.958751-4-dhowells@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 1101fb8 commit 1ccf164

File tree

1 file changed

+43
-29
lines changed

1 file changed

+43
-29
lines changed

fs/direct-io.c

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
#include "internal.h"
4343

4444
/*
45-
* How many user pages to map in one call to get_user_pages(). This determines
46-
* the size of a structure in the slab cache
45+
* How many user pages to map in one call to iov_iter_extract_pages(). This
46+
* determines the size of a structure in the slab cache
4747
*/
4848
#define DIO_PAGES 64
4949

@@ -121,12 +121,13 @@ struct dio {
121121
struct inode *inode;
122122
loff_t i_size; /* i_size when submitted */
123123
dio_iodone_t *end_io; /* IO completion function */
124+
bool is_pinned; /* T if we have pins on the pages */
124125

125126
void *private; /* copy from map_bh.b_private */
126127

127128
/* BIO completion state */
128129
spinlock_t bio_lock; /* protects BIO fields below */
129-
int page_errors; /* errno from get_user_pages() */
130+
int page_errors; /* err from iov_iter_extract_pages() */
130131
int is_async; /* is IO async ? */
131132
bool defer_completion; /* defer AIO completion to workqueue? */
132133
bool should_dirty; /* if pages should be dirtied */
@@ -165,23 +166,22 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
165166
*/
166167
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
167168
{
169+
struct page **pages = dio->pages;
168170
const enum req_op dio_op = dio->opf & REQ_OP_MASK;
169171
ssize_t ret;
170172

171-
ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
172-
&sdio->from);
173+
ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX,
174+
DIO_PAGES, 0, &sdio->from);
173175

174176
if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
175-
struct page *page = ZERO_PAGE(0);
176177
/*
177178
* A memory fault, but the filesystem has some outstanding
178179
* mapped blocks. We need to use those blocks up to avoid
179180
* leaking stale data in the file.
180181
*/
181182
if (dio->page_errors == 0)
182183
dio->page_errors = ret;
183-
get_page(page);
184-
dio->pages[0] = page;
184+
dio->pages[0] = ZERO_PAGE(0);
185185
sdio->head = 0;
186186
sdio->tail = 1;
187187
sdio->from = 0;
@@ -201,9 +201,9 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
201201

202202
/*
203203
* Get another userspace page. Returns an ERR_PTR on error. Pages are
204-
* buffered inside the dio so that we can call get_user_pages() against a
205-
* decent number of pages, less frequently. To provide nicer use of the
206-
* L1 cache.
204+
* buffered inside the dio so that we can call iov_iter_extract_pages()
205+
* against a decent number of pages, less frequently. To provide nicer use of
206+
* the L1 cache.
207207
*/
208208
static inline struct page *dio_get_page(struct dio *dio,
209209
struct dio_submit *sdio)
@@ -219,6 +219,18 @@ static inline struct page *dio_get_page(struct dio *dio,
219219
return dio->pages[sdio->head];
220220
}
221221

222+
static void dio_pin_page(struct dio *dio, struct page *page)
223+
{
224+
if (dio->is_pinned)
225+
folio_add_pin(page_folio(page));
226+
}
227+
228+
static void dio_unpin_page(struct dio *dio, struct page *page)
229+
{
230+
if (dio->is_pinned)
231+
unpin_user_page(page);
232+
}
233+
222234
/*
223235
* dio_complete() - called when all DIO BIO I/O has been completed
224236
*
@@ -402,8 +414,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
402414
bio->bi_end_io = dio_bio_end_aio;
403415
else
404416
bio->bi_end_io = dio_bio_end_io;
405-
/* for now require references for all pages */
406-
bio_set_flag(bio, BIO_PAGE_REFFED);
417+
if (dio->is_pinned)
418+
bio_set_flag(bio, BIO_PAGE_PINNED);
407419
sdio->bio = bio;
408420
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
409421
}
@@ -444,8 +456,9 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
444456
*/
445457
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
446458
{
447-
while (sdio->head < sdio->tail)
448-
put_page(dio->pages[sdio->head++]);
459+
if (dio->is_pinned)
460+
unpin_user_pages(dio->pages + sdio->head,
461+
sdio->tail - sdio->head);
449462
}
450463

451464
/*
@@ -676,7 +689,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
676689
*
677690
* Return zero on success. Non-zero means the caller needs to start a new BIO.
678691
*/
679-
static inline int dio_bio_add_page(struct dio_submit *sdio)
692+
static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
680693
{
681694
int ret;
682695

@@ -688,7 +701,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio)
688701
*/
689702
if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
690703
sdio->pages_in_io--;
691-
get_page(sdio->cur_page);
704+
dio_pin_page(dio, sdio->cur_page);
692705
sdio->final_block_in_bio = sdio->cur_page_block +
693706
(sdio->cur_page_len >> sdio->blkbits);
694707
ret = 0;
@@ -743,11 +756,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
743756
goto out;
744757
}
745758

746-
if (dio_bio_add_page(sdio) != 0) {
759+
if (dio_bio_add_page(dio, sdio) != 0) {
747760
dio_bio_submit(dio, sdio);
748761
ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
749762
if (ret == 0) {
750-
ret = dio_bio_add_page(sdio);
763+
ret = dio_bio_add_page(dio, sdio);
751764
BUG_ON(ret != 0);
752765
}
753766
}
@@ -804,13 +817,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
804817
*/
805818
if (sdio->cur_page) {
806819
ret = dio_send_cur_page(dio, sdio, map_bh);
807-
put_page(sdio->cur_page);
820+
dio_unpin_page(dio, sdio->cur_page);
808821
sdio->cur_page = NULL;
809822
if (ret)
810823
return ret;
811824
}
812825

813-
get_page(page); /* It is in dio */
826+
dio_pin_page(dio, page); /* It is in dio */
814827
sdio->cur_page = page;
815828
sdio->cur_page_offset = offset;
816829
sdio->cur_page_len = len;
@@ -825,7 +838,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
825838
ret = dio_send_cur_page(dio, sdio, map_bh);
826839
if (sdio->bio)
827840
dio_bio_submit(dio, sdio);
828-
put_page(sdio->cur_page);
841+
dio_unpin_page(dio, sdio->cur_page);
829842
sdio->cur_page = NULL;
830843
}
831844
return ret;
@@ -926,7 +939,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
926939

927940
ret = get_more_blocks(dio, sdio, map_bh);
928941
if (ret) {
929-
put_page(page);
942+
dio_unpin_page(dio, page);
930943
goto out;
931944
}
932945
if (!buffer_mapped(map_bh))
@@ -971,7 +984,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
971984

972985
/* AKPM: eargh, -ENOTBLK is a hack */
973986
if (dio_op == REQ_OP_WRITE) {
974-
put_page(page);
987+
dio_unpin_page(dio, page);
975988
return -ENOTBLK;
976989
}
977990

@@ -984,7 +997,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
984997
if (sdio->block_in_file >=
985998
i_size_aligned >> blkbits) {
986999
/* We hit eof */
987-
put_page(page);
1000+
dio_unpin_page(dio, page);
9881001
goto out;
9891002
}
9901003
zero_user(page, from, 1 << blkbits);
@@ -1024,7 +1037,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
10241037
sdio->next_block_for_io,
10251038
map_bh);
10261039
if (ret) {
1027-
put_page(page);
1040+
dio_unpin_page(dio, page);
10281041
goto out;
10291042
}
10301043
sdio->next_block_for_io += this_chunk_blocks;
@@ -1039,8 +1052,8 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
10391052
break;
10401053
}
10411054

1042-
/* Drop the ref which was taken in get_user_pages() */
1043-
put_page(page);
1055+
/* Drop the pin which was taken in get_user_pages() */
1056+
dio_unpin_page(dio, page);
10441057
}
10451058
out:
10461059
return ret;
@@ -1135,6 +1148,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
11351148
/* will be released by direct_io_worker */
11361149
inode_lock(inode);
11371150
}
1151+
dio->is_pinned = iov_iter_extract_will_pin(iter);
11381152

11391153
/* Once we sampled i_size check for reads beyond EOF */
11401154
dio->i_size = i_size_read(inode);
@@ -1259,7 +1273,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
12591273
ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
12601274
if (retval == 0)
12611275
retval = ret2;
1262-
put_page(sdio.cur_page);
1276+
dio_unpin_page(dio, sdio.cur_page);
12631277
sdio.cur_page = NULL;
12641278
}
12651279
if (sdio.bio)

0 commit comments

Comments
 (0)