
Commit 19d7f65

josefbacik authored and kdave committed
btrfs: convert the buffer_radix to an xarray
In order to fully utilize xarray tagging to improve writeback we need to
convert the buffer_radix to a proper xarray. This conversion is relatively
straightforward as the radix code uses the xarray underneath. Using the
xarray directly allows for quite a lot less code.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent 656e9f5 commit 19d7f65
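
Before the per-file diffs below, a minimal sketch of the lookup-side pattern the commit adopts in find_extent_buffer_nolock(): an RCU-protected xa_load() plus atomic_inc_not_zero() replaces radix_tree_lookup(). The demo_fs/demo_buffer types and the demo_lookup() helper are simplified stand-ins invented for illustration, not the btrfs structures.

/*
 * Sketch only: demo_buffer/demo_fs stand in for struct extent_buffer and
 * struct btrfs_fs_info; they are illustrative, not the kernel definitions.
 */
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>

struct demo_buffer {
        u64 start;
        atomic_t refs;
};

struct demo_fs {
        struct xarray buffer_tree;      /* indexed by start >> sectorsize_bits */
        u32 sectorsize_bits;
};

static struct demo_buffer *demo_lookup(struct demo_fs *fs, u64 start)
{
        unsigned long index = start >> fs->sectorsize_bits;
        struct demo_buffer *buf;

        /* RCU protects the xarray walk; lookups take no spinlock. */
        rcu_read_lock();
        buf = xa_load(&fs->buffer_tree, index);
        /* The entry may be mid-free, so only return it if a ref can be taken. */
        if (buf && !atomic_inc_not_zero(&buf->refs))
                buf = NULL;
        rcu_read_unlock();
        return buf;
}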

File tree

5 files changed: +109 -165 lines changed


fs/btrfs/disk-io.c

Lines changed: 12 additions & 2 deletions
@@ -2761,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
         return ret;
 }
 
+/*
+ * Lockdep gets confused between our buffer_tree which requires IRQ locking because
+ * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
+ * have these requirements. Use a class key so lockdep doesn't get them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 {
         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+        /* Use the same flags as mapping->i_pages. */
+        xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+        lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
         INIT_LIST_HEAD(&fs_info->trans_list);
         INIT_LIST_HEAD(&fs_info->dead_roots);
         INIT_LIST_HEAD(&fs_info->delayed_iputs);

@@ -2776,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
         spin_lock_init(&fs_info->delayed_iput_lock);
         spin_lock_init(&fs_info->defrag_inodes_lock);
         spin_lock_init(&fs_info->super_lock);
-        spin_lock_init(&fs_info->buffer_lock);
         spin_lock_init(&fs_info->unused_bgs_lock);
         spin_lock_init(&fs_info->treelog_bg_lock);
         spin_lock_init(&fs_info->zone_active_bgs_lock);
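
For context on the lock class key added above: xa_lock_irq() is simply spin_lock_irq() on the xarray's embedded xa_lock, and xarrays initialized through the common helpers share one lockdep class, so an IRQ-safe user (buffer_tree) and a plain user (the delayed inode xarray) would otherwise look like the same lock to lockdep. A hedged sketch of the init pattern; demo_init_trees() and the second xarray are illustrative only, not btrfs code.

#include <linux/xarray.h>
#include <linux/lockdep.h>

/* Private class so the IRQ-locked tree is not conflated with other xarrays. */
static struct lock_class_key demo_buffer_xa_class;

static void demo_init_trees(struct xarray *buffer_tree, struct xarray *plain_tree)
{
        /* Same flags as mapping->i_pages: IRQ-safe lock plus memcg accounting. */
        xa_init_flags(buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        lockdep_set_class(&buffer_tree->xa_lock, &demo_buffer_xa_class);

        /* A plain xarray keeps the default class and non-IRQ xa_lock usage. */
        xa_init(plain_tree);
}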

fs/btrfs/extent_io.c

Lines changed: 86 additions & 126 deletions
@@ -1866,19 +1866,17 @@ static void set_btree_ioerr(struct extent_buffer *eb)
  * context.
  */
 static struct extent_buffer *find_extent_buffer_nolock(
-                const struct btrfs_fs_info *fs_info, u64 start)
+                struct btrfs_fs_info *fs_info, u64 start)
 {
         struct extent_buffer *eb;
+        unsigned long index = (start >> fs_info->sectorsize_bits);
 
         rcu_read_lock();
-        eb = radix_tree_lookup(&fs_info->buffer_radix,
-                               start >> fs_info->sectorsize_bits);
-        if (eb && atomic_inc_not_zero(&eb->refs)) {
-                rcu_read_unlock();
-                return eb;
-        }
+        eb = xa_load(&fs_info->buffer_tree, index);
+        if (eb && !atomic_inc_not_zero(&eb->refs))
+                eb = NULL;
         rcu_read_unlock();
-        return NULL;
+        return eb;
 }
 
 static void end_bbio_meta_write(struct btrfs_bio *bbio)

@@ -2742,11 +2740,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
 
         if (!btrfs_meta_is_subpage(fs_info)) {
                 /*
-                 * We do this since we'll remove the pages after we've
-                 * removed the eb from the radix tree, so we could race
-                 * and have this page now attached to the new eb. So
-                 * only clear folio if it's still connected to
-                 * this eb.
+                 * We do this since we'll remove the pages after we've removed
+                 * the eb from the xarray, so we could race and have this page
+                 * now attached to the new eb. So only clear folio if it's
+                 * still connected to this eb.
                  */
                 if (folio_test_private(folio) && folio_get_private(folio) == eb) {
                         BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));

@@ -2911,9 +2908,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
         int refs;
         /*
-         * The TREE_REF bit is first set when the extent_buffer is added
-         * to the radix tree. It is also reset, if unset, when a new reference
-         * is created by find_extent_buffer.
+         * The TREE_REF bit is first set when the extent_buffer is added to the
+         * xarray. It is also reset, if unset, when a new reference is created
+         * by find_extent_buffer.
          *
          * It is only cleared in two cases: freeing the last non-tree
          * reference to the extent_buffer when its STALE bit is set or

@@ -2925,13 +2922,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
          * conditions between the calls to check_buffer_tree_ref in those
          * codepaths and clearing TREE_REF in try_release_extent_buffer.
          *
-         * The actual lifetime of the extent_buffer in the radix tree is
-         * adequately protected by the refcount, but the TREE_REF bit and
-         * its corresponding reference are not. To protect against this
-         * class of races, we call check_buffer_tree_ref from the codepaths
-         * which trigger io. Note that once io is initiated, TREE_REF can no
-         * longer be cleared, so that is the moment at which any such race is
-         * best fixed.
+         * The actual lifetime of the extent_buffer in the xarray is adequately
+         * protected by the refcount, but the TREE_REF bit and its corresponding
+         * reference are not. To protect against this class of races, we call
+         * check_buffer_tree_ref() from the code paths which trigger io. Note that
+         * once io is initiated, TREE_REF can no longer be cleared, so that is
+         * the moment at which any such race is best fixed.
          */
         refs = atomic_read(&eb->refs);
         if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))

@@ -2995,23 +2991,25 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
                 return ERR_PTR(-ENOMEM);
         eb->fs_info = fs_info;
 again:
-        ret = radix_tree_preload(GFP_NOFS);
-        if (ret) {
-                exists = ERR_PTR(ret);
-                goto free_eb;
+        xa_lock_irq(&fs_info->buffer_tree);
+        exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+                              NULL, eb, GFP_NOFS);
+        if (xa_is_err(exists)) {
+                ret = xa_err(exists);
+                xa_unlock_irq(&fs_info->buffer_tree);
+                btrfs_release_extent_buffer(eb);
+                return ERR_PTR(ret);
         }
-        spin_lock(&fs_info->buffer_lock);
-        ret = radix_tree_insert(&fs_info->buffer_radix,
-                                start >> fs_info->sectorsize_bits, eb);
-        spin_unlock(&fs_info->buffer_lock);
-        radix_tree_preload_end();
-        if (ret == -EEXIST) {
-                exists = find_extent_buffer(fs_info, start);
-                if (exists)
-                        goto free_eb;
-                else
+        if (exists) {
+                if (!atomic_inc_not_zero(&exists->refs)) {
+                        /* The extent buffer is being freed, retry. */
+                        xa_unlock_irq(&fs_info->buffer_tree);
                         goto again;
+                }
+                xa_unlock_irq(&fs_info->buffer_tree);
+                goto free_eb;
         }
+        xa_unlock_irq(&fs_info->buffer_tree);
         check_buffer_tree_ref(eb);
 
         return eb;

@@ -3032,9 +3030,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
         lockdep_assert_held(&folio->mapping->i_private_lock);
 
         /*
-         * For subpage case, we completely rely on radix tree to ensure we
-         * don't try to insert two ebs for the same bytenr. So here we always
-         * return NULL and just continue.
+         * For subpage case, we completely rely on xarray to ensure we don't try
+         * to insert two ebs for the same bytenr. So here we always return NULL
+         * and just continue.
          */
         if (btrfs_meta_is_subpage(fs_info))
                 return NULL;

@@ -3165,7 +3163,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
         /*
          * To inform we have an extra eb under allocation, so that
          * detach_extent_buffer_page() won't release the folio private when the
-         * eb hasn't been inserted into radix tree yet.
+         * eb hasn't been inserted into the xarray yet.
          *
          * The ref will be decreased when the eb releases the page, in
          * detach_extent_buffer_page(). Thus needs no special handling in the

@@ -3299,10 +3297,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
                 /*
                  * We can't unlock the pages just yet since the extent buffer
-                 * hasn't been properly inserted in the radix tree, this
-                 * opens a race with btree_release_folio which can free a page
-                 * while we are still filling in all pages for the buffer and
-                 * we could crash.
+                 * hasn't been properly inserted into the xarray, this opens a
+                 * race with btree_release_folio() which can free a page while we
+                 * are still filling in all pages for the buffer and we could crash.
                  */
         }
         if (uptodate)

@@ -3311,23 +3308,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
         if (page_contig)
                 eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
 again:
-        ret = radix_tree_preload(GFP_NOFS);
-        if (ret)
+        xa_lock_irq(&fs_info->buffer_tree);
+        existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+                                   start >> fs_info->sectorsize_bits, NULL, eb,
+                                   GFP_NOFS);
+        if (xa_is_err(existing_eb)) {
+                ret = xa_err(existing_eb);
+                xa_unlock_irq(&fs_info->buffer_tree);
                 goto out;
-
-        spin_lock(&fs_info->buffer_lock);
-        ret = radix_tree_insert(&fs_info->buffer_radix,
-                                start >> fs_info->sectorsize_bits, eb);
-        spin_unlock(&fs_info->buffer_lock);
-        radix_tree_preload_end();
-        if (ret == -EEXIST) {
-                ret = 0;
-                existing_eb = find_extent_buffer(fs_info, start);
-                if (existing_eb)
-                        goto out;
-                else
+        }
+        if (existing_eb) {
+                if (!atomic_inc_not_zero(&existing_eb->refs)) {
+                        xa_unlock_irq(&fs_info->buffer_tree);
                         goto again;
+                }
+                xa_unlock_irq(&fs_info->buffer_tree);
+                goto out;
         }
+        xa_unlock_irq(&fs_info->buffer_tree);
+
         /* add one reference for the tree */
         check_buffer_tree_ref(eb);
 

@@ -3397,10 +3396,23 @@ static int release_extent_buffer(struct extent_buffer *eb)
 
         spin_unlock(&eb->refs_lock);
 
-        spin_lock(&fs_info->buffer_lock);
-        radix_tree_delete_item(&fs_info->buffer_radix,
-                               eb->start >> fs_info->sectorsize_bits, eb);
-        spin_unlock(&fs_info->buffer_lock);
+        /*
+         * We're erasing, theoretically there will be no allocations, so
+         * just use GFP_ATOMIC.
+         *
+         * We use cmpxchg instead of erase because we do not know if
+         * this eb is actually in the tree or not, we could be cleaning
+         * up an eb that we allocated but never inserted into the tree.
+         * Thus use cmpxchg to remove it from the tree if it is there,
+         * or leave the other entry if this isn't in the tree.
+         *
+         * The documentation says that putting a NULL value is the same
+         * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
+         * in this case.
+         */
+        xa_cmpxchg_irq(&fs_info->buffer_tree,
+                       eb->start >> fs_info->sectorsize_bits, eb, NULL,
+                       GFP_ATOMIC);
 
         btrfs_leak_debug_del_eb(eb);
         /* Should be safe to release folios at this point. */

@@ -4231,82 +4243,27 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
         }
 }
 
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
-                const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
-        struct extent_buffer *gang[GANG_LOOKUP_SIZE];
-        struct extent_buffer *found = NULL;
-        u64 folio_start = folio_pos(folio);
-        u64 cur = folio_start;
-
-        ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
-        lockdep_assert_held(&fs_info->buffer_lock);
-
-        while (cur < folio_start + PAGE_SIZE) {
-                int ret;
-                int i;
-
-                ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
-                                (void **)gang, cur >> fs_info->sectorsize_bits,
-                                min_t(unsigned int, GANG_LOOKUP_SIZE,
-                                      PAGE_SIZE / fs_info->nodesize));
-                if (ret == 0)
-                        goto out;
-                for (i = 0; i < ret; i++) {
-                        /* Already beyond page end */
-                        if (gang[i]->start >= folio_start + PAGE_SIZE)
-                                goto out;
-                        /* Found one */
-                        if (gang[i]->start >= bytenr) {
-                                found = gang[i];
-                                goto out;
-                        }
-                }
-                cur = gang[ret - 1]->start + gang[ret - 1]->len;
-        }
-out:
-        return found;
-}
-
 static int try_release_subpage_extent_buffer(struct folio *folio)
 {
         struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
-        u64 cur = folio_pos(folio);
-        const u64 end = cur + PAGE_SIZE;
+        struct extent_buffer *eb;
+        unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+        unsigned long index = start;
+        unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
         int ret;
 
-        while (cur < end) {
-                struct extent_buffer *eb = NULL;
-
-                /*
-                 * Unlike try_release_extent_buffer() which uses folio private
-                 * to grab buffer, for subpage case we rely on radix tree, thus
-                 * we need to ensure radix tree consistency.
-                 *
-                 * We also want an atomic snapshot of the radix tree, thus go
-                 * with spinlock rather than RCU.
-                 */
-                spin_lock(&fs_info->buffer_lock);
-                eb = get_next_extent_buffer(fs_info, folio, cur);
-                if (!eb) {
-                        /* No more eb in the page range after or at cur */
-                        spin_unlock(&fs_info->buffer_lock);
-                        break;
-                }
-                cur = eb->start + eb->len;
-
+        xa_lock_irq(&fs_info->buffer_tree);
+        xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
                 /*
                  * The same as try_release_extent_buffer(), to ensure the eb
                  * won't disappear out from under us.
                  */
                 spin_lock(&eb->refs_lock);
                 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
                         spin_unlock(&eb->refs_lock);
-                        spin_unlock(&fs_info->buffer_lock);
-                        break;
+                        continue;
                 }
-                spin_unlock(&fs_info->buffer_lock);
+                xa_unlock_irq(&fs_info->buffer_tree);
 
                 /*
                  * If tree ref isn't set then we know the ref on this eb is a

@@ -4324,7 +4281,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
                  * release_extent_buffer() will release the refs_lock.
                  */
                 release_extent_buffer(eb);
+                xa_lock_irq(&fs_info->buffer_tree);
         }
+        xa_unlock_irq(&fs_info->buffer_tree);
+
         /*
          * Finally to check if we have cleared folio private, as if we have
          * released all ebs in the page, the folio private should be cleared now.
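
The insertion side (alloc_extent_buffer() and alloc_test_extent_buffer() above) replaces radix_tree_preload()/radix_tree_insert() with a single __xa_cmpxchg() under xa_lock_irq(): swapping NULL for the new buffer either succeeds, returns an error, or returns the current occupant, whose refcount must be raised or the insert retried. A simplified sketch of that retry loop, reusing the hypothetical demo_buffer/demo_fs types and includes from the lookup sketch near the top; demo_insert() is illustrative only.

#include <linux/err.h>

/*
 * Sketch of the insert-or-reuse pattern; returns the buffer that ends up in
 * the tree (the caller frees new_buf if an existing one is returned).
 */
static struct demo_buffer *demo_insert(struct demo_fs *fs, struct demo_buffer *new_buf)
{
        unsigned long index = new_buf->start >> fs->sectorsize_bits;
        struct demo_buffer *existing;

again:
        xa_lock_irq(&fs->buffer_tree);
        /* Install new_buf only if the slot is empty; otherwise get the occupant. */
        existing = __xa_cmpxchg(&fs->buffer_tree, index, NULL, new_buf, GFP_NOFS);
        if (xa_is_err(existing)) {
                xa_unlock_irq(&fs->buffer_tree);
                return ERR_PTR(xa_err(existing));
        }
        if (existing) {
                if (!atomic_inc_not_zero(&existing->refs)) {
                        /* The occupant is being freed; drop the lock and retry. */
                        xa_unlock_irq(&fs->buffer_tree);
                        goto again;
                }
                xa_unlock_irq(&fs->buffer_tree);
                return existing;
        }
        xa_unlock_irq(&fs->buffer_tree);
        return new_buf;
}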

fs/btrfs/fs.h

Lines changed: 1 addition & 3 deletions
@@ -777,10 +777,8 @@ struct btrfs_fs_info {
 
         struct btrfs_delayed_root *delayed_root;
 
-        /* Extent buffer radix tree */
-        spinlock_t buffer_lock;
         /* Entries are eb->start / sectorsize */
-        struct radix_tree_root buffer_radix;
+        struct xarray buffer_tree;
 
         /* Next backup root to be overwritten */
         int backup_root_index;
