Skip to content

Commit 6ae330c

Browse files
rhvgoyalMiklos Szeredi
authored andcommitted
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
1 parent 9483e7d commit 6ae330c

File tree

5 files changed

+110
-10
lines changed

5 files changed

+110
-10
lines changed

fs/fuse/dax.c

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,47 @@ static const struct iomap_ops fuse_iomap_ops = {
495495
.iomap_end = fuse_iomap_end,
496496
};
497497

498+
static void fuse_wait_dax_page(struct inode *inode)
499+
{
500+
struct fuse_inode *fi = get_fuse_inode(inode);
501+
502+
up_write(&fi->i_mmap_sem);
503+
schedule();
504+
down_write(&fi->i_mmap_sem);
505+
}
506+
507+
/* Should be called with fi->i_mmap_sem lock held exclusively */
508+
static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
509+
loff_t start, loff_t end)
510+
{
511+
struct page *page;
512+
513+
page = dax_layout_busy_page_range(inode->i_mapping, start, end);
514+
if (!page)
515+
return 0;
516+
517+
*retry = true;
518+
return ___wait_var_event(&page->_refcount,
519+
atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
520+
0, 0, fuse_wait_dax_page(inode));
521+
}
522+
523+
/* dmap_end == 0 leads to unmapping of whole file */
524+
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
525+
u64 dmap_end)
526+
{
527+
bool retry;
528+
int ret;
529+
530+
do {
531+
retry = false;
532+
ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
533+
dmap_end);
534+
} while (ret == 0 && retry);
535+
536+
return ret;
537+
}
538+
498539
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
499540
{
500541
struct inode *inode = file_inode(iocb->ki_filp);
@@ -596,10 +637,18 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
596637
if (write)
597638
sb_start_pagefault(sb);
598639

640+
/*
641+
* We need to serialize against not only truncate but also against
642+
* fuse dax memory range reclaim. While a range is being reclaimed,
643+
* we do not want any read/write/mmap to make progress and try
644+
* to populate page cache or access memory we are trying to free.
645+
*/
646+
down_read(&get_fuse_inode(inode)->i_mmap_sem);
599647
ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fuse_iomap_ops);
600648

601649
if (ret & VM_FAULT_NEEDDSYNC)
602650
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
651+
up_read(&get_fuse_inode(inode)->i_mmap_sem);
603652

604653
if (write)
605654
sb_end_pagefault(sb);

fs/fuse/dir.c

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
15011501
loff_t oldsize;
15021502
int err;
15031503
bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
1504+
bool fault_blocked = false;
15041505

15051506
if (!fc->default_permissions)
15061507
attr->ia_valid |= ATTR_FORCE;
@@ -1509,6 +1510,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
15091510
if (err)
15101511
return err;
15111512

1513+
if (attr->ia_valid & ATTR_SIZE) {
1514+
if (WARN_ON(!S_ISREG(inode->i_mode)))
1515+
return -EIO;
1516+
is_truncate = true;
1517+
}
1518+
1519+
if (FUSE_IS_DAX(inode) && is_truncate) {
1520+
down_write(&fi->i_mmap_sem);
1521+
fault_blocked = true;
1522+
err = fuse_dax_break_layouts(inode, 0, 0);
1523+
if (err) {
1524+
up_write(&fi->i_mmap_sem);
1525+
return err;
1526+
}
1527+
}
1528+
15121529
if (attr->ia_valid & ATTR_OPEN) {
15131530
/* This is coming from open(..., ... | O_TRUNC); */
15141531
WARN_ON(!(attr->ia_valid & ATTR_SIZE));
@@ -1521,17 +1538,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
15211538
*/
15221539
i_size_write(inode, 0);
15231540
truncate_pagecache(inode, 0);
1524-
return 0;
1541+
goto out;
15251542
}
15261543
file = NULL;
15271544
}
15281545

1529-
if (attr->ia_valid & ATTR_SIZE) {
1530-
if (WARN_ON(!S_ISREG(inode->i_mode)))
1531-
return -EIO;
1532-
is_truncate = true;
1533-
}
1534-
15351546
/* Flush dirty data/metadata before non-truncate SETATTR */
15361547
if (is_wb && S_ISREG(inode->i_mode) &&
15371548
attr->ia_valid &
@@ -1614,13 +1625,20 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
16141625
}
16151626

16161627
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1628+
out:
1629+
if (fault_blocked)
1630+
up_write(&fi->i_mmap_sem);
1631+
16171632
return 0;
16181633

16191634
error:
16201635
if (is_truncate)
16211636
fuse_release_nowrite(inode);
16221637

16231638
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1639+
1640+
if (fault_blocked)
1641+
up_write(&fi->i_mmap_sem);
16241642
return err;
16251643
}
16261644

fs/fuse/file.c

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,22 +221,34 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
221221
bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
222222
fc->atomic_o_trunc &&
223223
fc->writeback_cache;
224+
bool dax_truncate = (file->f_flags & O_TRUNC) &&
225+
fc->atomic_o_trunc && FUSE_IS_DAX(inode);
224226

225227
err = generic_file_open(inode, file);
226228
if (err)
227229
return err;
228230

229-
if (is_wb_truncate) {
231+
if (is_wb_truncate || dax_truncate) {
230232
inode_lock(inode);
231233
fuse_set_nowrite(inode);
232234
}
233235

234-
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
236+
if (dax_truncate) {
237+
down_write(&get_fuse_inode(inode)->i_mmap_sem);
238+
err = fuse_dax_break_layouts(inode, 0, 0);
239+
if (err)
240+
goto out;
241+
}
235242

243+
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
236244
if (!err)
237245
fuse_finish_open(inode, file);
238246

239-
if (is_wb_truncate) {
247+
out:
248+
if (dax_truncate)
249+
up_write(&get_fuse_inode(inode)->i_mmap_sem);
250+
251+
if (is_wb_truncate | dax_truncate) {
240252
fuse_release_nowrite(inode);
241253
inode_unlock(inode);
242254
}
@@ -3221,6 +3233,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
32213233
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
32223234
(mode & FALLOC_FL_PUNCH_HOLE);
32233235

3236+
bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
3237+
32243238
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
32253239
return -EOPNOTSUPP;
32263240

@@ -3229,6 +3243,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
32293243

32303244
if (lock_inode) {
32313245
inode_lock(inode);
3246+
if (block_faults) {
3247+
down_write(&fi->i_mmap_sem);
3248+
err = fuse_dax_break_layouts(inode, 0, 0);
3249+
if (err)
3250+
goto out;
3251+
}
3252+
32323253
if (mode & FALLOC_FL_PUNCH_HOLE) {
32333254
loff_t endbyte = offset + length - 1;
32343255

@@ -3278,6 +3299,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
32783299
if (!(mode & FALLOC_FL_KEEP_SIZE))
32793300
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
32803301

3302+
if (block_faults)
3303+
up_write(&fi->i_mmap_sem);
3304+
32813305
if (lock_inode)
32823306
inode_unlock(inode);
32833307

fs/fuse/fuse_i.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,13 @@ struct fuse_inode {
149149
/** Lock to protect write related fields */
150150
spinlock_t lock;
151151

152+
/**
153+
* Can't take inode lock in fault path (leads to circular dependency).
154+
* Introduce another semaphore which can be taken in fault path and
155+
* then other filesystem paths can take this to block faults.
156+
*/
157+
struct rw_semaphore i_mmap_sem;
158+
152159
#ifdef CONFIG_FUSE_DAX
153160
/*
154161
* Dax specific inode data
@@ -1116,6 +1123,7 @@ void fuse_free_conn(struct fuse_conn *fc);
11161123
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
11171124
ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
11181125
int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
1126+
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
11191127
int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
11201128
void fuse_dax_conn_free(struct fuse_conn *fc);
11211129
bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);

fs/fuse/inode.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
8585
fi->orig_ino = 0;
8686
fi->state = 0;
8787
mutex_init(&fi->mutex);
88+
init_rwsem(&fi->i_mmap_sem);
8889
spin_lock_init(&fi->lock);
8990
fi->forget = fuse_alloc_forget();
9091
if (!fi->forget)

0 commit comments

Comments
 (0)