
Commit 55cdd0a

Wang Jianchao authored and tytso committed
ext4: get discard out of jbd2 commit kthread contex
Right now, discard is issued and waited on in the jbd2 commit kthread context after the log is committed. When a large number of files are deleted and discards flood the device, the jbd2 commit kthread can be blocked for a long time, and all metadata operations then block waiting for log space.

One case is the page fault path: with the read side of mm->mmap_sem held, it wants to update the file time but has to wait for log space. When another thread in the task then wants to mmap, the write side of mmap_sem blocks, and all subsequent readers of mmap_sem block as well, even the ps command, which needs to read /proc/pid/cmdline. Our monitoring service, which reads /proc/pid/cmdline, used to be blocked for 5 minutes.

This patch frees the blocks back to the buddy after commit and then does the discard in an async kworker context, in fstrim fashion, namely:
- mark the blocks to be discarded as used if they have not been allocated
- do the discard
- mark them free again
After this, the jbd2 commit kthread is no longer blocked by discard, and we won't get ENOSPC even if the discard is slow or throttled.

Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
Suggested-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
Link: https://lore.kernel.org/r/20210830075246.12516-5-jianchao.wan9@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
1 parent b6f5558 commit 55cdd0a
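The fix is an instance of the standard Linux deferred-work pattern: the commit path appends free extents to a list under a spinlock and kicks a work item on system_unbound_wq only when the list goes from empty to non-empty; the worker then splices the whole batch off under the lock and issues the slow discards outside the jbd2 commit kthread. Below is a minimal, self-contained sketch of that pattern; the identifiers are illustrative, not the ext4 ones used in the diff further down.

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Illustrative state; ext4 keeps the equivalents in struct ext4_sb_info. */
struct deferred_ctx {
	spinlock_t lock;		/* protects @pending */
	struct list_head pending;	/* items waiting for the worker */
	struct work_struct work;	/* runs in an unbound kworker */
};

struct deferred_item {
	struct list_head list;
	/* ...extent to discard... */
};

static void deferred_work_fn(struct work_struct *work)
{
	struct deferred_ctx *ctx = container_of(work, struct deferred_ctx, work);
	struct deferred_item *it, *tmp;
	LIST_HEAD(batch);

	/* Grab the whole batch under the lock, then work on it unlocked. */
	spin_lock(&ctx->lock);
	list_splice_init(&ctx->pending, &batch);
	spin_unlock(&ctx->lock);

	list_for_each_entry_safe(it, tmp, &batch, list) {
		/* issue the (possibly slow) discard for this item here */
		kfree(it);
	}
}

static void deferred_init(struct deferred_ctx *ctx)
{
	spin_lock_init(&ctx->lock);
	INIT_LIST_HEAD(&ctx->pending);
	INIT_WORK(&ctx->work, deferred_work_fn);
}

/* Fast path (in ext4: ext4_process_freed_data(), after the jbd2 commit). */
static void deferred_submit(struct deferred_ctx *ctx, struct deferred_item *it)
{
	bool wake;

	spin_lock(&ctx->lock);
	/* If the list was non-empty, a work item is already queued or running. */
	wake = list_empty(&ctx->pending);
	list_add_tail(&it->list, &ctx->pending);
	spin_unlock(&ctx->lock);

	if (wake)
		queue_work(system_unbound_wq, &ctx->work);
}

The same "queue only on the empty-to-non-empty transition" check is what the patch implements with the wake flag in ext4_process_freed_data(), and the flush_work() added to ext4_mb_release() is what guarantees the list is drained before the superblock goes away.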

2 files changed: 78 additions and 25 deletions

fs/ext4/ext4.h

Lines changed: 2 additions & 0 deletions
@@ -1536,6 +1536,8 @@ struct ext4_sb_info {
 	unsigned int s_mb_free_pending;
 	struct list_head s_freed_data_list;	/* List of blocks to be freed
 						   after commit completed */
+	struct list_head s_discard_list;
+	struct work_struct s_discard_work;
 	struct rb_root s_mb_avg_fragment_size_root;
 	rwlock_t s_mb_rb_lock;
 	struct list_head *s_mb_largest_free_orders;

fs/ext4/mballoc.c

Lines changed: 76 additions & 25 deletions
@@ -408,6 +408,10 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr);
 
+static int ext4_try_to_trim_range(struct super_block *sb,
+		struct ext4_buddy *e4b, ext4_grpblk_t start,
+		ext4_grpblk_t max, ext4_grpblk_t minblocks);
+
 /*
  * The algorithm using this percpu seq counter goes below:
  * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -3308,6 +3312,55 @@ static int ext4_groupinfo_create_slab(size_t size)
 	return 0;
 }
 
+static void ext4_discard_work(struct work_struct *work)
+{
+	struct ext4_sb_info *sbi = container_of(work,
+			struct ext4_sb_info, s_discard_work);
+	struct super_block *sb = sbi->s_sb;
+	struct ext4_free_data *fd, *nfd;
+	struct ext4_buddy e4b;
+	struct list_head discard_list;
+	ext4_group_t grp, load_grp;
+	int err = 0;
+
+	INIT_LIST_HEAD(&discard_list);
+	spin_lock(&sbi->s_md_lock);
+	list_splice_init(&sbi->s_discard_list, &discard_list);
+	spin_unlock(&sbi->s_md_lock);
+
+	load_grp = UINT_MAX;
+	list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
+		/*
+		 * If filesystem is umounting or no memory, give up the discard
+		 */
+		if ((sb->s_flags & SB_ACTIVE) && !err) {
+			grp = fd->efd_group;
+			if (grp != load_grp) {
+				if (load_grp != UINT_MAX)
+					ext4_mb_unload_buddy(&e4b);
+
+				err = ext4_mb_load_buddy(sb, grp, &e4b);
+				if (err) {
+					kmem_cache_free(ext4_free_data_cachep, fd);
+					load_grp = UINT_MAX;
+					continue;
+				} else {
+					load_grp = grp;
+				}
+			}
+
+			ext4_lock_group(sb, grp);
+			ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
+				fd->efd_start_cluster + fd->efd_count - 1, 1);
+			ext4_unlock_group(sb, grp);
+		}
+		kmem_cache_free(ext4_free_data_cachep, fd);
+	}
+
+	if (load_grp != UINT_MAX)
+		ext4_mb_unload_buddy(&e4b);
+}
+
 int ext4_mb_init(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3376,6 +3429,8 @@ int ext4_mb_init(struct super_block *sb)
 	spin_lock_init(&sbi->s_md_lock);
 	sbi->s_mb_free_pending = 0;
 	INIT_LIST_HEAD(&sbi->s_freed_data_list);
+	INIT_LIST_HEAD(&sbi->s_discard_list);
+	INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -3474,6 +3529,14 @@ int ext4_mb_release(struct super_block *sb)
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	int count;
 
+	if (test_opt(sb, DISCARD)) {
+		/*
+		 * wait the discard work to drain all of ext4_free_data
+		 */
+		flush_work(&sbi->s_discard_work);
+		WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
+	}
+
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
 			cond_resched();
@@ -3596,7 +3659,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 		put_page(e4b.bd_bitmap_page);
 	}
 	ext4_unlock_group(sb, entry->efd_group);
-	kmem_cache_free(ext4_free_data_cachep, entry);
 	ext4_mb_unload_buddy(&e4b);
 
 	mb_debug(sb, "freed %d blocks in %d structures\n", count,
@@ -3611,10 +3673,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_free_data *entry, *tmp;
-	struct bio *discard_bio = NULL;
 	struct list_head freed_data_list;
 	struct list_head *cut_pos = NULL;
-	int err;
+	bool wake;
 
 	INIT_LIST_HEAD(&freed_data_list);
 
@@ -3629,30 +3690,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 			  cut_pos);
 	spin_unlock(&sbi->s_md_lock);
 
-	if (test_opt(sb, DISCARD)) {
-		list_for_each_entry(entry, &freed_data_list, efd_list) {
-			err = ext4_issue_discard(sb, entry->efd_group,
-						 entry->efd_start_cluster,
-						 entry->efd_count,
-						 &discard_bio);
-			if (err && err != -EOPNOTSUPP) {
-				ext4_msg(sb, KERN_WARNING, "discard request in"
-					 " group:%d block:%d count:%d failed"
-					 " with %d", entry->efd_group,
-					 entry->efd_start_cluster,
-					 entry->efd_count, err);
-			} else if (err == -EOPNOTSUPP)
-				break;
-		}
+	list_for_each_entry(entry, &freed_data_list, efd_list)
+		ext4_free_data_in_buddy(sb, entry);
 
-		if (discard_bio) {
-			submit_bio_wait(discard_bio);
-			bio_put(discard_bio);
-		}
+	if (test_opt(sb, DISCARD)) {
+		spin_lock(&sbi->s_md_lock);
+		wake = list_empty(&sbi->s_discard_list);
+		list_splice_tail(&freed_data_list, &sbi->s_discard_list);
+		spin_unlock(&sbi->s_md_lock);
+		if (wake)
+			queue_work(system_unbound_wq, &sbi->s_discard_work);
+	} else {
+		list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
+			kmem_cache_free(ext4_free_data_cachep, entry);
 	}
-
-	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
-		ext4_free_data_in_buddy(sb, entry);
 }
 
 int __init ext4_init_mballoc(void)
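Note that this diff only adds a forward declaration of ext4_try_to_trim_range(); the worker reuses the fstrim machinery that already exists elsewhere in mballoc.c. Per the commit message, trimming one free extent "in fstrim fashion" is a three-step sequence; the following is a rough schematic of that sequence (simplified from the existing trim path, not the verbatim kernel source).

/*
 * Schematic of the "mark used -> discard -> mark free" sequence described
 * in the commit message; simplified, not the literal mballoc.c code.
 * Caller holds the group lock and has the buddy (e4b) loaded.
 */
static int trim_extent_sketch(struct super_block *sb, struct ext4_buddy *e4b,
			      ext4_grpblk_t start, int count)
{
	struct ext4_free_extent ex = {
		.fe_start = start,
		.fe_group = e4b->bd_group,
		.fe_len = count,
	};
	int ret;

	/* 1. Mark the clusters used so the allocator cannot hand them out. */
	mb_mark_used(e4b, &ex);

	/* 2. Drop the group lock while the (possibly slow) discard is issued. */
	ext4_unlock_group(sb, e4b->bd_group);
	ret = ext4_issue_discard(sb, e4b->bd_group, start, count, NULL);
	ext4_lock_group(sb, e4b->bd_group);

	/* 3. Return the clusters to the buddy bitmap as free again. */
	mb_free_blocks(NULL, e4b, start, count);
	return ret;
}

Because the range is marked busy before the discard is sent to the device, a concurrent allocation can never hand out clusters whose discard is still in flight, which is why the worker can safely run long after the transaction that freed them has committed.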
