@@ -1866,19 +1866,17 @@ static void set_btree_ioerr(struct extent_buffer *eb)
  * context.
  */
 static struct extent_buffer *find_extent_buffer_nolock(
-		const struct btrfs_fs_info *fs_info, u64 start)
+		struct btrfs_fs_info *fs_info, u64 start)
 {
 	struct extent_buffer *eb;
+	unsigned long index = (start >> fs_info->sectorsize_bits);
 
 	rcu_read_lock();
-	eb = radix_tree_lookup(&fs_info->buffer_radix,
-			       start >> fs_info->sectorsize_bits);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		return eb;
-	}
+	eb = xa_load(&fs_info->buffer_tree, index);
+	if (eb && !atomic_inc_not_zero(&eb->refs))
+		eb = NULL;
 	rcu_read_unlock();
-	return NULL;
+	return eb;
 }
 
 static void end_bbio_meta_write(struct btrfs_bio *bbio)
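
For context, the new lookup combines two guarantees: xa_load() may be called under rcu_read_lock() without taking the xa_lock, and atomic_inc_not_zero() refuses to revive an object whose refcount already reached zero, so a buffer in teardown is simply reported as a miss. A minimal sketch of the same idiom outside btrfs, assuming the cached objects are freed only after an RCU grace period so the loaded pointer stays valid across the refcount check (my_obj and my_objects are hypothetical names, not from this patch):

#include <linux/xarray.h>
#include <linux/atomic.h>
#include <linux/rcupdate.h>

struct my_obj {
	atomic_t refs;			/* freed (RCU-deferred) once this hits zero */
};

static DEFINE_XARRAY(my_objects);	/* hypothetical cache, indexed like buffer_tree */

static struct my_obj *my_obj_lookup(unsigned long index)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = xa_load(&my_objects, index);	/* lock-free read-side lookup */
	if (obj && !atomic_inc_not_zero(&obj->refs))
		obj = NULL;			/* entry is mid-teardown, treat as a miss */
	rcu_read_unlock();
	return obj;
}
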
@@ -2742,11 +2740,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
 
 	if (!btrfs_meta_is_subpage(fs_info)) {
 		/*
-		 * We do this since we'll remove the pages after we've
-		 * removed the eb from the radix tree, so we could race
-		 * and have this page now attached to the new eb. So
-		 * only clear folio if it's still connected to
-		 * this eb.
+		 * We do this since we'll remove the pages after we've removed
+		 * the eb from the xarray, so we could race and have this page
+		 * now attached to the new eb. So only clear folio if it's
+		 * still connected to this eb.
 		 */
 		if (folio_test_private(folio) && folio_get_private(folio) == eb) {
 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2911,9 +2908,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
 	/*
-	 * The TREE_REF bit is first set when the extent_buffer is added
-	 * to the radix tree. It is also reset, if unset, when a new reference
-	 * is created by find_extent_buffer.
+	 * The TREE_REF bit is first set when the extent_buffer is added to the
+	 * xarray. It is also reset, if unset, when a new reference is created
+	 * by find_extent_buffer.
 	 *
 	 * It is only cleared in two cases: freeing the last non-tree
 	 * reference to the extent_buffer when its STALE bit is set or
@@ -2925,13 +2922,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	 * conditions between the calls to check_buffer_tree_ref in those
 	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
 	 *
-	 * The actual lifetime of the extent_buffer in the radix tree is
-	 * adequately protected by the refcount, but the TREE_REF bit and
-	 * its corresponding reference are not. To protect against this
-	 * class of races, we call check_buffer_tree_ref from the codepaths
-	 * which trigger io. Note that once io is initiated, TREE_REF can no
-	 * longer be cleared, so that is the moment at which any such race is
-	 * best fixed.
+	 * The actual lifetime of the extent_buffer in the xarray is adequately
+	 * protected by the refcount, but the TREE_REF bit and its corresponding
+	 * reference are not. To protect against this class of races, we call
+	 * check_buffer_tree_ref() from the code paths which trigger io. Note that
+	 * once io is initiated, TREE_REF can no longer be cleared, so that is
+	 * the moment at which any such race is best fixed.
 	 */
 	refs = atomic_read(&eb->refs);
 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -2995,23 +2991,25 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 		return ERR_PTR(-ENOMEM);
 	eb->fs_info = fs_info;
 again:
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
-		exists = ERR_PTR(ret);
-		goto free_eb;
+	xa_lock_irq(&fs_info->buffer_tree);
+	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+			      NULL, eb, GFP_NOFS);
+	if (xa_is_err(exists)) {
+		ret = xa_err(exists);
+		xa_unlock_irq(&fs_info->buffer_tree);
+		btrfs_release_extent_buffer(eb);
+		return ERR_PTR(ret);
 	}
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> fs_info->sectorsize_bits, eb);
-	spin_unlock(&fs_info->buffer_lock);
-	radix_tree_preload_end();
-	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
-		if (exists)
-			goto free_eb;
-		else
+	if (exists) {
+		if (!atomic_inc_not_zero(&exists->refs)) {
+			/* The extent buffer is being freed, retry. */
+			xa_unlock_irq(&fs_info->buffer_tree);
 			goto again;
+		}
+		xa_unlock_irq(&fs_info->buffer_tree);
+		goto free_eb;
 	}
+	xa_unlock_irq(&fs_info->buffer_tree);
 	check_buffer_tree_ref(eb);
 
 	return eb;
@@ -3032,9 +3030,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
 	lockdep_assert_held(&folio->mapping->i_private_lock);
 
 	/*
-	 * For subpage case, we completely rely on radix tree to ensure we
-	 * don't try to insert two ebs for the same bytenr. So here we always
-	 * return NULL and just continue.
+	 * For subpage case, we completely rely on xarray to ensure we don't try
+	 * to insert two ebs for the same bytenr. So here we always return NULL
+	 * and just continue.
 	 */
 	if (btrfs_meta_is_subpage(fs_info))
 		return NULL;
@@ -3165,7 +3163,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
 	/*
 	 * To inform we have an extra eb under allocation, so that
 	 * detach_extent_buffer_page() won't release the folio private when the
-	 * eb hasn't been inserted into radix tree yet.
+	 * eb hasn't been inserted into the xarray yet.
 	 *
 	 * The ref will be decreased when the eb releases the page, in
 	 * detach_extent_buffer_page(). Thus needs no special handling in the
@@ -3299,10 +3297,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
 		/*
 		 * We can't unlock the pages just yet since the extent buffer
-		 * hasn't been properly inserted in the radix tree, this
-		 * opens a race with btree_release_folio which can free a page
-		 * while we are still filling in all pages for the buffer and
-		 * we could crash.
+		 * hasn't been properly inserted into the xarray, this opens a
+		 * race with btree_release_folio() which can free a page while we
+		 * are still filling in all pages for the buffer and we could crash.
 		 */
 	}
 	if (uptodate)
@@ -3311,23 +3308,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	if (page_contig)
 		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
 again:
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret)
+	xa_lock_irq(&fs_info->buffer_tree);
+	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+				   start >> fs_info->sectorsize_bits, NULL, eb,
+				   GFP_NOFS);
+	if (xa_is_err(existing_eb)) {
+		ret = xa_err(existing_eb);
+		xa_unlock_irq(&fs_info->buffer_tree);
 		goto out;
-
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> fs_info->sectorsize_bits, eb);
-	spin_unlock(&fs_info->buffer_lock);
-	radix_tree_preload_end();
-	if (ret == -EEXIST) {
-		ret = 0;
-		existing_eb = find_extent_buffer(fs_info, start);
-		if (existing_eb)
-			goto out;
-		else
+	}
+	if (existing_eb) {
+		if (!atomic_inc_not_zero(&existing_eb->refs)) {
+			xa_unlock_irq(&fs_info->buffer_tree);
 			goto again;
+		}
+		xa_unlock_irq(&fs_info->buffer_tree);
+		goto out;
 	}
+	xa_unlock_irq(&fs_info->buffer_tree);
+
 	/* add one reference for the tree */
 	check_buffer_tree_ref(eb);
 
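
Both allocation paths (alloc_test_extent_buffer() and alloc_extent_buffer()) now follow the same shape: a single __xa_cmpxchg(..., NULL, eb, ...) under xa_lock_irq() either publishes the new buffer, reports an allocation error, or hands back the buffer already occupying the slot, in which case the caller takes a reference and retries if that buffer is already being freed. A condensed sketch of that decision tree, with my_cache_add(), my_obj and its refs field as illustrative placeholders rather than anything from this patch:

#include <linux/xarray.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/gfp.h>

struct my_obj {
	atomic_t refs;
};

/* Publish obj at index, or return the (referenced) entry already stored there. */
static struct my_obj *my_cache_add(struct xarray *cache, unsigned long index,
				   struct my_obj *obj)
{
	struct my_obj *existing;

again:
	xa_lock_irq(cache);
	existing = __xa_cmpxchg(cache, index, NULL, obj, GFP_NOFS);
	if (xa_is_err(existing)) {
		xa_unlock_irq(cache);
		return ERR_PTR(xa_err(existing));	/* e.g. -ENOMEM */
	}
	if (existing) {
		if (!atomic_inc_not_zero(&existing->refs)) {
			/* The old entry is being torn down; retry until it is gone. */
			xa_unlock_irq(cache);
			goto again;
		}
		xa_unlock_irq(cache);
		return existing;	/* caller must free its unused obj */
	}
	xa_unlock_irq(cache);
	return obj;			/* obj is now visible to lookups */
}
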
@@ -3397,10 +3396,23 @@ static int release_extent_buffer(struct extent_buffer *eb)
33973396
33983397 spin_unlock (& eb -> refs_lock );
33993398
3400- spin_lock (& fs_info -> buffer_lock );
3401- radix_tree_delete_item (& fs_info -> buffer_radix ,
3402- eb -> start >> fs_info -> sectorsize_bits , eb );
3403- spin_unlock (& fs_info -> buffer_lock );
3399+ /*
3400+ * We're erasing, theoretically there will be no allocations, so
3401+ * just use GFP_ATOMIC.
3402+ *
3403+ * We use cmpxchg instead of erase because we do not know if
3404+ * this eb is actually in the tree or not, we could be cleaning
3405+ * up an eb that we allocated but never inserted into the tree.
3406+ * Thus use cmpxchg to remove it from the tree if it is there,
3407+ * or leave the other entry if this isn't in the tree.
3408+ *
3409+ * The documentation says that putting a NULL value is the same
3410+ * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
3411+ * in this case.
3412+ */
3413+ xa_cmpxchg_irq (& fs_info -> buffer_tree ,
3414+ eb -> start >> fs_info -> sectorsize_bits , eb , NULL ,
3415+ GFP_ATOMIC );
34043416
34053417 btrfs_leak_debug_del_eb (eb );
34063418 /* Should be safe to release folios at this point. */
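
The removal keeps the old radix_tree_delete_item() semantics: clear the slot only if it still holds this eb, since the buffer being destroyed may never have been inserted, or the slot may meanwhile hold a different buffer. A rough equivalent of that conditional erase, with my_cache_remove() as a hypothetical illustration:

#include <linux/xarray.h>
#include <linux/gfp.h>

/* Clear the slot at @index only if it still points at @obj. */
static void my_cache_remove(struct xarray *cache, unsigned long index, void *obj)
{
	/*
	 * Storing NULL behaves like an erase because the xarray does not
	 * use XA_FLAGS_ALLOC; if a different object already occupies the
	 * slot, the compare fails and that entry is left untouched.
	 */
	xa_cmpxchg_irq(cache, index, obj, NULL, GFP_ATOMIC);
}
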
@@ -4231,82 +4243,27 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
 	}
 }
 
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
-		const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
-	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
-	struct extent_buffer *found = NULL;
-	u64 folio_start = folio_pos(folio);
-	u64 cur = folio_start;
-
-	ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
-	lockdep_assert_held(&fs_info->buffer_lock);
-
-	while (cur < folio_start + PAGE_SIZE) {
-		int ret;
-		int i;
-
-		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
-				(void **)gang, cur >> fs_info->sectorsize_bits,
-				min_t(unsigned int, GANG_LOOKUP_SIZE,
-				      PAGE_SIZE / fs_info->nodesize));
-		if (ret == 0)
-			goto out;
-		for (i = 0; i < ret; i++) {
-			/* Already beyond page end */
-			if (gang[i]->start >= folio_start + PAGE_SIZE)
-				goto out;
-			/* Found one */
-			if (gang[i]->start >= bytenr) {
-				found = gang[i];
-				goto out;
-			}
-		}
-		cur = gang[ret - 1]->start + gang[ret - 1]->len;
-	}
-out:
-	return found;
-}
-
 static int try_release_subpage_extent_buffer(struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
-	u64 cur = folio_pos(folio);
-	const u64 end = cur + PAGE_SIZE;
+	struct extent_buffer *eb;
+	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+	unsigned long index = start;
+	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
 	int ret;
 
-	while (cur < end) {
-		struct extent_buffer *eb = NULL;
-
-		/*
-		 * Unlike try_release_extent_buffer() which uses folio private
-		 * to grab buffer, for subpage case we rely on radix tree, thus
-		 * we need to ensure radix tree consistency.
-		 *
-		 * We also want an atomic snapshot of the radix tree, thus go
-		 * with spinlock rather than RCU.
-		 */
-		spin_lock(&fs_info->buffer_lock);
-		eb = get_next_extent_buffer(fs_info, folio, cur);
-		if (!eb) {
-			/* No more eb in the page range after or at cur */
-			spin_unlock(&fs_info->buffer_lock);
-			break;
-		}
-		cur = eb->start + eb->len;
-
+	xa_lock_irq(&fs_info->buffer_tree);
+	xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
 		/*
 		 * The same as try_release_extent_buffer(), to ensure the eb
 		 * won't disappear out from under us.
 		 */
 		spin_lock(&eb->refs_lock);
 		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 			spin_unlock(&eb->refs_lock);
-			spin_unlock(&fs_info->buffer_lock);
-			break;
+			continue;
 		}
-		spin_unlock(&fs_info->buffer_lock);
+		xa_unlock_irq(&fs_info->buffer_tree);
 
 		/*
 		 * If tree ref isn't set then we know the ref on this eb is a
@@ -4324,7 +4281,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		 * release_extent_buffer() will release the refs_lock.
 		 */
 		release_extent_buffer(eb);
+		xa_lock_irq(&fs_info->buffer_tree);
 	}
+	xa_unlock_irq(&fs_info->buffer_tree);
+
 	/*
 	 * Finally to check if we have cleared folio private, as if we have
 	 * released all ebs in the page, the folio private should be cleared now.
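
The rewritten loop above has one subtlety worth calling out: release_extent_buffer() itself takes the tree's xa_lock (via the xa_cmpxchg_irq() shown in the earlier hunk), so the iteration must drop the lock before releasing a buffer and retake it before asking xa_for_each_range() for the next entry. Reduced to just that locking pattern, with my_obj_can_release()/my_obj_release() as hypothetical stand-ins declared but not defined here:

#include <linux/types.h>
#include <linux/xarray.h>

struct my_obj;

/* Hypothetical helpers: the release path re-takes the xarray's xa_lock. */
bool my_obj_can_release(struct my_obj *obj);
void my_obj_release(struct my_obj *obj);

/* Visit every object stored in [start, end] and try to drop each one. */
static void my_cache_release_range(struct xarray *cache,
				   unsigned long start, unsigned long end)
{
	struct my_obj *obj;
	unsigned long index = start;

	xa_lock_irq(cache);
	xa_for_each_range(cache, index, obj, start, end) {
		if (!my_obj_can_release(obj))
			continue;
		/* my_obj_release() would deadlock on the xa_lock, so drop it. */
		xa_unlock_irq(cache);
		my_obj_release(obj);
		xa_lock_irq(cache);
	}
	xa_unlock_irq(cache);
}

Because xa_for_each_range() re-searches from the saved index on every iteration, entries erased while the lock was dropped are simply skipped on the next pass, which is what makes this drop/retake pattern safe.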