Skip to content

Commit 0ae45f6

Browse files
tytso authored and Al Viro committed
vfs: add support for a lazytime mount option
Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated (a) when the inode needs to be updated for some non-time-related change, (b) when userspace calls fsync(), syncfs(), or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests them via an fsync(2) call. For workloads which feature a large number of random writes to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDDs, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long-tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent e36f014 commit 0ae45f6

File tree

13 files changed

+186
-35
lines changed

13 files changed

+186
-35
lines changed

fs/ext4/inode.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
48404840
* If the inode is marked synchronous, we don't honour that here - doing
48414841
* so would cause a commit on atime updates, which we don't bother doing.
48424842
* We handle synchronous inodes at the highest possible level.
4843+
*
4844+
* If only the I_DIRTY_TIME flag is set, we can skip everything. If
4845+
* I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
4846+
* to copy into the on-disk inode structure are the timestamp fields.
48434847
*/
48444848
void ext4_dirty_inode(struct inode *inode, int flags)
48454849
{
48464850
handle_t *handle;
48474851

4852+
if (flags == I_DIRTY_TIME)
4853+
return;
48484854
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
48494855
if (IS_ERR(handle))
48504856
goto out;

fs/fs-writeback.c

Lines changed: 51 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -247,28 +247,41 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
247247
return ret;
248248
}
249249

250+
#define EXPIRE_DIRTY_ATIME 0x0001
251+
250252
/*
251253
* Move expired (dirtied before work->older_than_this) dirty inodes from
252254
* @delaying_queue to @dispatch_queue.
253255
*/
254256
static int move_expired_inodes(struct list_head *delaying_queue,
255257
struct list_head *dispatch_queue,
258+
int flags,
256259
struct wb_writeback_work *work)
257260
{
261+
unsigned long *older_than_this = NULL;
262+
unsigned long expire_time;
258263
LIST_HEAD(tmp);
259264
struct list_head *pos, *node;
260265
struct super_block *sb = NULL;
261266
struct inode *inode;
262267
int do_sb_sort = 0;
263268
int moved = 0;
264269

270+
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
271+
older_than_this = work->older_than_this;
272+
else if ((work->reason == WB_REASON_SYNC) == 0) {
273+
expire_time = jiffies - (HZ * 86400);
274+
older_than_this = &expire_time;
275+
}
265276
while (!list_empty(delaying_queue)) {
266277
inode = wb_inode(delaying_queue->prev);
267-
if (work->older_than_this &&
268-
inode_dirtied_after(inode, *work->older_than_this))
278+
if (older_than_this &&
279+
inode_dirtied_after(inode, *older_than_this))
269280
break;
270281
list_move(&inode->i_wb_list, &tmp);
271282
moved++;
283+
if (flags & EXPIRE_DIRTY_ATIME)
284+
set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
272285
if (sb_is_blkdev_sb(inode->i_sb))
273286
continue;
274287
if (sb && sb != inode->i_sb)
@@ -309,9 +322,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
309322
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310323
{
311324
int moved;
325+
312326
assert_spin_locked(&wb->list_lock);
313327
list_splice_init(&wb->b_more_io, &wb->b_io);
314-
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
328+
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
329+
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
330+
EXPIRE_DIRTY_ATIME, work);
315331
trace_writeback_queue_io(wb, work, moved);
316332
}
317333

@@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
435451
* updates after data IO completion.
436452
*/
437453
redirty_tail(inode, wb);
454+
} else if (inode->i_state & I_DIRTY_TIME) {
455+
list_move(&inode->i_wb_list, &wb->b_dirty_time);
438456
} else {
439457
/* The inode is clean. Remove from writeback lists. */
440458
list_del_init(&inode->i_wb_list);
@@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481499
spin_lock(&inode->i_lock);
482500

483501
dirty = inode->i_state & I_DIRTY;
484-
inode->i_state &= ~I_DIRTY;
502+
if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
503+
(inode->i_state & I_DIRTY_TIME)) ||
504+
(inode->i_state & I_DIRTY_TIME_EXPIRED)) {
505+
dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
506+
trace_writeback_lazytime(inode);
507+
}
508+
inode->i_state &= ~dirty;
485509

486510
/*
487511
* Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
501525

502526
spin_unlock(&inode->i_lock);
503527

528+
if (dirty & I_DIRTY_TIME)
529+
mark_inode_dirty_sync(inode);
504530
/* Don't write the inode if only I_DIRTY_PAGES was set */
505-
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
531+
if (dirty & ~I_DIRTY_PAGES) {
506532
int err = write_inode(inode, wbc);
507533
if (ret == 0)
508534
ret = err;
@@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
550576
* make sure inode is on some writeback list and leave it there unless
551577
* we have completely cleaned the inode.
552578
*/
553-
if (!(inode->i_state & I_DIRTY) &&
579+
if (!(inode->i_state & I_DIRTY_ALL) &&
554580
(wbc->sync_mode != WB_SYNC_ALL ||
555581
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556582
goto out;
@@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
565591
* If inode is clean, remove it from writeback lists. Otherwise don't
566592
* touch it. See comment above for explanation.
567593
*/
568-
if (!(inode->i_state & I_DIRTY))
594+
if (!(inode->i_state & I_DIRTY_ALL))
569595
list_del_init(&inode->i_wb_list);
570596
spin_unlock(&wb->list_lock);
571597
inode_sync_complete(inode);
@@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb,
707733
wrote += write_chunk - wbc.nr_to_write;
708734
spin_lock(&wb->list_lock);
709735
spin_lock(&inode->i_lock);
710-
if (!(inode->i_state & I_DIRTY))
736+
if (!(inode->i_state & I_DIRTY_ALL))
711737
wrote++;
712738
requeue_inode(inode, wb, &wbc);
713739
inode_sync_complete(inode);
@@ -1145,40 +1171,52 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
11451171
* page->mapping->host, so the page-dirtying time is recorded in the internal
11461172
* blockdev inode.
11471173
*/
1174+
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
11481175
void __mark_inode_dirty(struct inode *inode, int flags)
11491176
{
11501177
struct super_block *sb = inode->i_sb;
11511178
struct backing_dev_info *bdi = NULL;
1179+
int dirtytime;
1180+
1181+
trace_writeback_mark_inode_dirty(inode, flags);
11521182

11531183
/*
11541184
* Don't do this for I_DIRTY_PAGES - that doesn't actually
11551185
* dirty the inode itself
11561186
*/
1157-
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1187+
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
11581188
trace_writeback_dirty_inode_start(inode, flags);
11591189

11601190
if (sb->s_op->dirty_inode)
11611191
sb->s_op->dirty_inode(inode, flags);
11621192

11631193
trace_writeback_dirty_inode(inode, flags);
11641194
}
1195+
if (flags & I_DIRTY_INODE)
1196+
flags &= ~I_DIRTY_TIME;
1197+
dirtytime = flags & I_DIRTY_TIME;
11651198

11661199
/*
11671200
* Paired with smp_mb() in __writeback_single_inode() for the
11681201
* following lockless i_state test. See there for details.
11691202
*/
11701203
smp_mb();
11711204

1172-
if ((inode->i_state & flags) == flags)
1205+
if (((inode->i_state & flags) == flags) ||
1206+
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
11731207
return;
11741208

11751209
if (unlikely(block_dump))
11761210
block_dump___mark_inode_dirty(inode);
11771211

11781212
spin_lock(&inode->i_lock);
1213+
if (dirtytime && (inode->i_state & I_DIRTY_INODE))
1214+
goto out_unlock_inode;
11791215
if ((inode->i_state & flags) != flags) {
11801216
const int was_dirty = inode->i_state & I_DIRTY;
11811217

1218+
if (flags & I_DIRTY_INODE)
1219+
inode->i_state &= ~I_DIRTY_TIME;
11821220
inode->i_state |= flags;
11831221

11841222
/*
@@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
12251263
}
12261264

12271265
inode->dirtied_when = jiffies;
1228-
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1266+
list_move(&inode->i_wb_list, dirtytime ?
1267+
&bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
12291268
spin_unlock(&bdi->wb.list_lock);
1269+
trace_writeback_dirty_inode_enqueue(inode);
12301270

12311271
if (wakeup_bdi)
12321272
bdi_wakeup_thread_delayed(bdi);

fs/gfs2/file.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
655655
{
656656
struct address_space *mapping = file->f_mapping;
657657
struct inode *inode = mapping->host;
658-
int sync_state = inode->i_state & I_DIRTY;
658+
int sync_state = inode->i_state & I_DIRTY_ALL;
659659
struct gfs2_inode *ip = GFS2_I(inode);
660660
int ret = 0, ret1 = 0;
661661

@@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
668668
if (!gfs2_is_jdata(ip))
669669
sync_state &= ~I_DIRTY_PAGES;
670670
if (datasync)
671-
sync_state &= ~I_DIRTY_SYNC;
671+
sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
672672

673673
if (sync_state) {
674674
ret = sync_inode_metadata(inode, 1);

fs/inode.c

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <linux/buffer_head.h> /* for inode_has_buffers */
1919
#include <linux/ratelimit.h>
2020
#include <linux/list_lru.h>
21+
#include <trace/events/writeback.h>
2122
#include "internal.h"
2223

2324
/*
@@ -30,7 +31,7 @@
3031
* inode_sb_list_lock protects:
3132
* sb->s_inodes, inode->i_sb_list
3233
* bdi->wb.list_lock protects:
33-
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
34+
* bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
3435
* inode_hash_lock protects:
3536
* inode_hashtable, inode->i_hash
3637
*
@@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode)
416417
*/
417418
void inode_add_lru(struct inode *inode)
418419
{
419-
if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
420+
if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
421+
I_FREEING | I_WILL_FREE)) &&
420422
!atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
421423
inode_lru_list_add(inode);
422424
}
@@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
647649
spin_unlock(&inode->i_lock);
648650
continue;
649651
}
650-
if (inode->i_state & I_DIRTY && !kill_dirty) {
652+
if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
651653
spin_unlock(&inode->i_lock);
652654
busy = 1;
653655
continue;
@@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode)
14321434
*/
14331435
void iput(struct inode *inode)
14341436
{
1435-
if (inode) {
1436-
BUG_ON(inode->i_state & I_CLEAR);
1437-
1438-
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1439-
iput_final(inode);
1437+
if (!inode)
1438+
return;
1439+
BUG_ON(inode->i_state & I_CLEAR);
1440+
retry:
1441+
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1442+
if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1443+
atomic_inc(&inode->i_count);
1444+
inode->i_state &= ~I_DIRTY_TIME;
1445+
spin_unlock(&inode->i_lock);
1446+
trace_writeback_lazytime_iput(inode);
1447+
mark_inode_dirty_sync(inode);
1448+
goto retry;
1449+
}
1450+
iput_final(inode);
14401451
}
14411452
}
14421453
EXPORT_SYMBOL(iput);
@@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
14951506
return 0;
14961507
}
14971508

1498-
/*
1499-
* This does the actual work of updating an inodes time or version. Must have
1500-
* had called mnt_want_write() before calling this.
1501-
*/
1502-
static int update_time(struct inode *inode, struct timespec *time, int flags)
1509+
int generic_update_time(struct inode *inode, struct timespec *time, int flags)
15031510
{
1504-
if (inode->i_op->update_time)
1505-
return inode->i_op->update_time(inode, time, flags);
1511+
int iflags = I_DIRTY_TIME;
15061512

15071513
if (flags & S_ATIME)
15081514
inode->i_atime = *time;
@@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
15121518
inode->i_ctime = *time;
15131519
if (flags & S_MTIME)
15141520
inode->i_mtime = *time;
1515-
mark_inode_dirty_sync(inode);
1521+
1522+
if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
1523+
iflags |= I_DIRTY_SYNC;
1524+
__mark_inode_dirty(inode, iflags);
15161525
return 0;
15171526
}
1527+
EXPORT_SYMBOL(generic_update_time);
1528+
1529+
/*
1530+
* This does the actual work of updating an inode's time or version. Must have
1531+
* had called mnt_want_write() before calling this.
1532+
*/
1533+
static int update_time(struct inode *inode, struct timespec *time, int flags)
1534+
{
1535+
int (*update_time)(struct inode *, struct timespec *, int);
1536+
1537+
update_time = inode->i_op->update_time ? inode->i_op->update_time :
1538+
generic_update_time;
1539+
1540+
return update_time(inode, time, flags);
1541+
}
15181542

15191543
/**
15201544
* touch_atime - update the access time

fs/jfs/file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3939
return rc;
4040

4141
mutex_lock(&inode->i_mutex);
42-
if (!(inode->i_state & I_DIRTY) ||
42+
if (!(inode->i_state & I_DIRTY_ALL) ||
4343
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
4444
/* Make sure committed changes hit the disk */
4545
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);

fs/libfs.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
948948

949949
mutex_lock(&inode->i_mutex);
950950
ret = sync_mapping_buffers(inode->i_mapping);
951-
if (!(inode->i_state & I_DIRTY))
951+
if (!(inode->i_state & I_DIRTY_ALL))
952952
goto out;
953953
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
954954
goto out;

fs/proc_namespace.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
4444
{ MS_SYNCHRONOUS, ",sync" },
4545
{ MS_DIRSYNC, ",dirsync" },
4646
{ MS_MANDLOCK, ",mand" },
47+
{ MS_LAZYTIME, ",lazytime" },
4748
{ 0, NULL }
4849
};
4950
const struct proc_fs_info *fs_infop;

fs/sync.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
177177
*/
178178
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
179179
{
180+
struct inode *inode = file->f_mapping->host;
181+
180182
if (!file->f_op->fsync)
181183
return -EINVAL;
184+
if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
185+
spin_lock(&inode->i_lock);
186+
inode->i_state &= ~I_DIRTY_TIME;
187+
spin_unlock(&inode->i_lock);
188+
mark_inode_dirty_sync(inode);
189+
}
182190
return file->f_op->fsync(file, start, end, datasync);
183191
}
184192
EXPORT_SYMBOL(vfs_fsync_range);

include/linux/backing-dev.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ struct bdi_writeback {
5555
struct list_head b_dirty; /* dirty inodes */
5656
struct list_head b_io; /* parked for writeback */
5757
struct list_head b_more_io; /* parked for more writeback */
58+
struct list_head b_dirty_time; /* time stamps are dirty */
5859
spinlock_t list_lock; /* protects the b_* lists */
5960
};
6061

0 commit comments

Comments
 (0)