
Commit 5e121ae

josefbacik authored and kdave committed
btrfs: use buffer xarray for extent buffer writeback operations
Currently we have this ugly back and forth with the btree writeback where we find the folio, find the eb associated with that folio, and then attempt to writeback. This results in two different paths for subpage ebs and >= page size ebs.

Clean this up by adding our own infrastructure around looking up tagged ebs and writing the ebs out directly. This allows us to unify the subpage and >= pagesize IO paths, resulting in a much cleaner writeback path for extent buffers.

I ran this through fsperf on a VM with 8 CPUs and 16GiB of RAM. I used smallfiles100k, but reduced the files to 1k to make it run faster. The results are as follows, with the statistically significant improvements marked with *; there were no regressions. fsperf was run with -n 10 for both runs, so the baseline is the average of 10 runs and the test is the average of 10 runs.

smallfiles100k results

     metric                 baseline        current        stdev          diff
================================================================================
avg_commit_ms                  68.58          58.44         3.35       -14.79% *
commits                       270.60         254.70        16.24        -5.88%
dev_read_iops                     48             48            0         0.00%
dev_read_kbytes                 1044           1044            0         0.00%
dev_write_iops             866117.90      850028.10     14292.20        -1.86%
dev_write_kbytes         10939976.40    10605701.20    351330.32        -3.06%
elapsed                        49.30             33         1.64       -33.06% *
end_state_mount_ns       41251498.80    35773220.70   2531205.32       -13.28% *
end_state_umount_ns         1.90e+09       1.50e+09  14186226.85       -21.38% *
max_commit_ms                    139         111.60         9.72       -19.71% *
sys_cpu                         4.90           3.86         0.88       -21.29%
write_bw_bytes           42935768.20    64318451.10   1609415.05        49.80% *
write_clat_ns_mean         366431.69      243202.60     14161.98       -33.63% *
write_clat_ns_p50           49203.20          20992       264.40       -57.34% *
write_clat_ns_p99             827392      653721.60     65904.74       -20.99% *
write_io_kbytes              2035940        2035940            0         0.00%
write_iops                  10482.37       15702.75       392.92        49.80% *
write_lat_ns_max            1.01e+08       90516129   3910102.06       -10.29% *
write_lat_ns_mean          366556.19      243308.48     14154.51       -33.62% *

As you can see we get about a 33% decrease in runtime, with a 50% throughput increase, which is pretty significant.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
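The core of the change is easier to see condensed than in the full hunks below: tag the dirty extent buffers in the buffer_tree XArray, pull them back out in reference-counted batches, and write each one directly. The following is an abridged excerpt of the btree_write_cache_pages() loop as it looks after this patch (error handling, zoned-mode bookkeeping and the nr_to_write accounting are elided), not a separate implementation:

        eb_batch_init(&batch);
        if (wbc->sync_mode == WB_SYNC_ALL)
                buffer_tree_tag_for_writeback(fs_info, index, end);
        while ((nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
                struct extent_buffer *eb;

                while ((eb = eb_batch_next(&batch)) != NULL) {
                        if (!lock_extent_buffer_for_io(eb, wbc))
                                continue;
                        write_one_eb(eb, wbc);
                }
                /* Drops the references taken by the tagged lookup. */
                eb_batch_release(&batch);
                cond_resched();
        }

Because the lookup operates on extent buffers rather than folios, the same loop serves both subpage and >= page size ebs, which is where the unification comes from.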
1 parent 4bc0a3c commit 5e121ae

File tree: 3 files changed, +166 −177 lines


fs/btrfs/extent_io.c

Lines changed: 163 additions & 174 deletions
@@ -1895,6 +1895,111 @@ static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mar
         xas_unlock_irqrestore(&xas, flags);
 }
 
+static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info,
+                                          unsigned long start, unsigned long end)
+{
+        XA_STATE(xas, &fs_info->buffer_tree, start);
+        unsigned int tagged = 0;
+        void *eb;
+
+        xas_lock_irq(&xas);
+        xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) {
+                xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+                if (++tagged % XA_CHECK_SCHED)
+                        continue;
+                xas_pause(&xas);
+                xas_unlock_irq(&xas);
+                cond_resched();
+                xas_lock_irq(&xas);
+        }
+        xas_unlock_irq(&xas);
+}
+
+struct eb_batch {
+        unsigned int nr;
+        unsigned int cur;
+        struct extent_buffer *ebs[PAGEVEC_SIZE];
+};
+
+static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb)
+{
+        batch->ebs[batch->nr++] = eb;
+        return (batch->nr < PAGEVEC_SIZE);
+}
+
+static inline void eb_batch_init(struct eb_batch *batch)
+{
+        batch->nr = 0;
+        batch->cur = 0;
+}
+
+static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch)
+{
+        if (batch->cur >= batch->nr)
+                return NULL;
+        return batch->ebs[batch->cur++];
+}
+
+static inline void eb_batch_release(struct eb_batch *batch)
+{
+        for (unsigned int i = 0; i < batch->nr; i++)
+                free_extent_buffer(batch->ebs[i]);
+        eb_batch_init(batch);
+}
+
+static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max,
+                                                xa_mark_t mark)
+{
+        struct extent_buffer *eb;
+
+retry:
+        eb = xas_find_marked(xas, max, mark);
+
+        if (xas_retry(xas, eb))
+                goto retry;
+
+        if (!eb)
+                return NULL;
+
+        if (!atomic_inc_not_zero(&eb->refs)) {
+                xas_reset(xas);
+                goto retry;
+        }
+
+        if (unlikely(eb != xas_reload(xas))) {
+                free_extent_buffer(eb);
+                xas_reset(xas);
+                goto retry;
+        }
+
+        return eb;
+}
+
+static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
+                                            unsigned long *start,
+                                            unsigned long end, xa_mark_t tag,
+                                            struct eb_batch *batch)
+{
+        XA_STATE(xas, &fs_info->buffer_tree, *start);
+        struct extent_buffer *eb;
+
+        rcu_read_lock();
+        while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
+                if (!eb_batch_add(batch, eb)) {
+                        *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
+                        goto out;
+                }
+        }
+        if (end == ULONG_MAX)
+                *start = ULONG_MAX;
+        else
+                *start = end + 1;
+out:
+        rcu_read_unlock();
+
+        return batch->nr;
+}
+
 /*
  * The endio specific version which won't touch any unsafe spinlock in endio
  * context.
@@ -1997,163 +2102,36 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 }
 
 /*
- * Submit one subpage btree page.
+ * Wait for all eb writeback in the given range to finish.
  *
- * The main difference to submit_eb_page() is:
- * - Page locking
- *   For subpage, we don't rely on page locking at all.
- *
- * - Flush write bio
- *   We only flush bio if we may be unable to fit current extent buffers into
- *   current bio.
- *
- * Return >=0 for the number of submitted extent buffers.
- * Return <0 for fatal error.
+ * @fs_info:    The fs_info for this file system.
+ * @start:      The offset of the range to start waiting on writeback.
+ * @end:        The end of the range, inclusive. This is meant to be used in
+ *              conjuction with wait_marked_extents, so this will usually be
+ *              the_next_eb->start - 1.
  */
-static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
+                                      u64 end)
 {
-        struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
-        int submitted = 0;
-        u64 folio_start = folio_pos(folio);
-        int bit_start = 0;
-        int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
-        const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+        struct eb_batch batch;
+        unsigned long start_index = (start >> fs_info->sectorsize_bits);
+        unsigned long end_index = (end >> fs_info->sectorsize_bits);
 
-        /* Lock and write each dirty extent buffers in the range */
-        while (bit_start < blocks_per_folio) {
-                struct btrfs_subpage *subpage = folio_get_private(folio);
+        eb_batch_init(&batch);
+        while (start_index <= end_index) {
                 struct extent_buffer *eb;
-                unsigned long flags;
-                u64 start;
+                unsigned int nr_ebs;
 
-                /*
-                 * Take private lock to ensure the subpage won't be detached
-                 * in the meantime.
-                 */
-                spin_lock(&folio->mapping->i_private_lock);
-                if (!folio_test_private(folio)) {
-                        spin_unlock(&folio->mapping->i_private_lock);
+                nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index,
+                                                 PAGECACHE_TAG_WRITEBACK, &batch);
+                if (!nr_ebs)
                         break;
-                }
-                spin_lock_irqsave(&subpage->lock, flags);
-                if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio,
-                              subpage->bitmaps)) {
-                        spin_unlock_irqrestore(&subpage->lock, flags);
-                        spin_unlock(&folio->mapping->i_private_lock);
-                        bit_start += sectors_per_node;
-                        continue;
-                }
-
-                start = folio_start + bit_start * fs_info->sectorsize;
-                bit_start += sectors_per_node;
-
-                /*
-                 * Here we just want to grab the eb without touching extra
-                 * spin locks, so call find_extent_buffer_nolock().
-                 */
-                eb = find_extent_buffer_nolock(fs_info, start);
-                spin_unlock_irqrestore(&subpage->lock, flags);
-                spin_unlock(&folio->mapping->i_private_lock);
-
-                /*
-                 * The eb has already reached 0 refs thus find_extent_buffer()
-                 * doesn't return it. We don't need to write back such eb
-                 * anyway.
-                 */
-                if (!eb)
-                        continue;
-
-                if (lock_extent_buffer_for_io(eb, wbc)) {
-                        write_one_eb(eb, wbc);
-                        submitted++;
-                }
-                free_extent_buffer(eb);
-        }
-        return submitted;
-}
-
-/*
- * Submit all page(s) of one extent buffer.
- *
- * @page:       the page of one extent buffer
- * @eb_context: to determine if we need to submit this page, if current page
- *              belongs to this eb, we don't need to submit
- *
- * The caller should pass each page in their bytenr order, and here we use
- * @eb_context to determine if we have submitted pages of one extent buffer.
- *
- * If we have, we just skip until we hit a new page that doesn't belong to
- * current @eb_context.
- *
- * If not, we submit all the page(s) of the extent buffer.
- *
- * Return >0 if we have submitted the extent buffer successfully.
- * Return 0 if we don't need to submit the page, as it's already submitted by
- * previous call.
- * Return <0 for fatal error.
- */
-static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
-{
-        struct writeback_control *wbc = ctx->wbc;
-        struct address_space *mapping = folio->mapping;
-        struct extent_buffer *eb;
-        int ret;
-
-        if (!folio_test_private(folio))
-                return 0;
-
-        if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
-                return submit_eb_subpage(folio, wbc);
-
-        spin_lock(&mapping->i_private_lock);
-        if (!folio_test_private(folio)) {
-                spin_unlock(&mapping->i_private_lock);
-                return 0;
-        }
-
-        eb = folio_get_private(folio);
-
-        /*
-         * Shouldn't happen and normally this would be a BUG_ON but no point
-         * crashing the machine for something we can survive anyway.
-         */
-        if (WARN_ON(!eb)) {
-                spin_unlock(&mapping->i_private_lock);
-                return 0;
-        }
-
-        if (eb == ctx->eb) {
-                spin_unlock(&mapping->i_private_lock);
-                return 0;
-        }
-        ret = atomic_inc_not_zero(&eb->refs);
-        spin_unlock(&mapping->i_private_lock);
-        if (!ret)
-                return 0;
 
-        ctx->eb = eb;
-
-        ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
-        if (ret) {
-                if (ret == -EBUSY)
-                        ret = 0;
-                free_extent_buffer(eb);
-                return ret;
-        }
-
-        if (!lock_extent_buffer_for_io(eb, wbc)) {
-                free_extent_buffer(eb);
-                return 0;
-        }
-        /* Implies write in zoned mode. */
-        if (ctx->zoned_bg) {
-                /* Mark the last eb in the block group. */
-                btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
-                ctx->zoned_bg->meta_write_pointer += eb->len;
+                while ((eb = eb_batch_next(&batch)) != NULL)
+                        wait_on_extent_buffer_writeback(eb);
+                eb_batch_release(&batch);
+                cond_resched();
         }
-        write_one_eb(eb, wbc);
-        free_extent_buffer(eb);
-        return 1;
 }
 
 int btree_write_cache_pages(struct address_space *mapping,
@@ -2164,25 +2142,27 @@ int btree_write_cache_pages(struct address_space *mapping,
         int ret = 0;
         int done = 0;
         int nr_to_write_done = 0;
-        struct folio_batch fbatch;
-        unsigned int nr_folios;
-        pgoff_t index;
-        pgoff_t end;            /* Inclusive */
+        struct eb_batch batch;
+        unsigned int nr_ebs;
+        unsigned long index;
+        unsigned long end;
         int scanned = 0;
         xa_mark_t tag;
 
-        folio_batch_init(&fbatch);
+        eb_batch_init(&batch);
         if (wbc->range_cyclic) {
-                index = mapping->writeback_index; /* Start from prev offset */
+                index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
                 end = -1;
+
                 /*
                  * Start from the beginning does not need to cycle over the
                  * range, mark it as scanned.
                  */
                 scanned = (index == 0);
         } else {
-                index = wbc->range_start >> PAGE_SHIFT;
-                end = wbc->range_end >> PAGE_SHIFT;
+                index = (wbc->range_start >> fs_info->sectorsize_bits);
+                end = (wbc->range_end >> fs_info->sectorsize_bits);
+
                 scanned = 1;
         }
         if (wbc->sync_mode == WB_SYNC_ALL)
@@ -2192,31 +2172,40 @@ int btree_write_cache_pages(struct address_space *mapping,
         btrfs_zoned_meta_io_lock(fs_info);
 retry:
         if (wbc->sync_mode == WB_SYNC_ALL)
-                tag_pages_for_writeback(mapping, index, end);
+                buffer_tree_tag_for_writeback(fs_info, index, end);
         while (!done && !nr_to_write_done && (index <= end) &&
-               (nr_folios = filemap_get_folios_tag(mapping, &index, end,
-                                            tag, &fbatch))) {
-                unsigned i;
+               (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
+                struct extent_buffer *eb;
 
-                for (i = 0; i < nr_folios; i++) {
-                        struct folio *folio = fbatch.folios[i];
+                while ((eb = eb_batch_next(&batch)) != NULL) {
+                        ctx.eb = eb;
+
+                        ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx);
+                        if (ret) {
+                                if (ret == -EBUSY)
+                                        ret = 0;
 
-                        ret = submit_eb_page(folio, &ctx);
-                        if (ret == 0)
+                                if (ret) {
+                                        done = 1;
+                                        break;
+                                }
+                                free_extent_buffer(eb);
                                 continue;
-                        if (ret < 0) {
-                                done = 1;
-                                break;
                         }
 
-                        /*
-                         * the filesystem may choose to bump up nr_to_write.
-                         * We have to make sure to honor the new nr_to_write
-                         * at any time
-                         */
-                        nr_to_write_done = wbc->nr_to_write <= 0;
+                        if (!lock_extent_buffer_for_io(eb, wbc))
+                                continue;
+
+                        /* Implies write in zoned mode. */
+                        if (ctx.zoned_bg) {
+                                /* Mark the last eb in the block group. */
+                                btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
+                                ctx.zoned_bg->meta_write_pointer += eb->len;
+                        }
+                        write_one_eb(eb, wbc);
                 }
-                folio_batch_release(&fbatch);
+                nr_to_write_done = (wbc->nr_to_write <= 0);
+                eb_batch_release(&batch);
                 cond_resched();
         }
         if (!scanned && !done) {
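The eb_batch and buffer_tree_get_ebs_tag() helpers above are static to extent_io.c, so any further users would live in the same file. Purely as an illustration of how the pieces compose (this helper is hypothetical and not part of the commit), counting the dirty extent buffers in a byte range would follow the same pattern as btrfs_btree_wait_writeback_range():

static unsigned long count_dirty_ebs_in_range(struct btrfs_fs_info *fs_info,
                                              u64 start, u64 end)
{
        struct eb_batch batch;
        unsigned long index = (start >> fs_info->sectorsize_bits);
        unsigned long end_index = (end >> fs_info->sectorsize_bits);
        unsigned long count = 0;

        eb_batch_init(&batch);
        while (index <= end_index) {
                unsigned int nr_ebs;

                nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end_index,
                                                 PAGECACHE_TAG_DIRTY, &batch);
                if (!nr_ebs)
                        break;
                count += nr_ebs;
                /* Drop the eb references taken by find_get_eb(). */
                eb_batch_release(&batch);
                cond_resched();
        }
        return count;
}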

fs/btrfs/extent_io.h

Lines changed: 1 addition & 0 deletions
@@ -240,6 +240,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
 int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
                             struct writeback_control *wbc);
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
 void btrfs_readahead(struct readahead_control *rac);
 int set_folio_extent_mapped(struct folio *folio);
 void clear_folio_extent_mapped(struct folio *folio);
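The header change only exports the new wait helper; the commit's third changed file is not shown on this page. As the kernel-doc in extent_io.c notes, the inclusive @end is intended to be the start of the next marked extent minus one. A minimal, purely illustrative caller (the function name here is hypothetical, not from this commit) would look like:

/* Illustration only: wait for btree writeback on one marked range, where
 * next_eb_start is the start of the next marked extent (hence the
 * inclusive "- 1").
 */
static void wait_one_marked_range(struct btrfs_fs_info *fs_info,
                                  u64 range_start, u64 next_eb_start)
{
        btrfs_btree_wait_writeback_range(fs_info, range_start,
                                         next_eb_start - 1);
}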
