@@ -1213,6 +1213,35 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		if (em->generation < newer_than)
 			goto next;
 
+		/*
+		 * Our start offset might be in the middle of an existing extent
+		 * map, so take that into account.
+		 */
+		range_len = em->len - (cur - em->start);
+		/*
+		 * If this range of the extent map is already flagged for delalloc,
+		 * skip it, because:
+		 *
+		 * 1) We could deadlock later, when trying to reserve space for
+		 *    delalloc, because in case we can't immediately reserve space
+		 *    the flusher can start delalloc and wait for the respective
+		 *    ordered extents to complete. The deadlock would happen
+		 *    because we do the space reservation while holding the range
+		 *    locked, and starting writeback, or finishing an ordered
+		 *    extent, requires locking the range;
+		 *
+		 * 2) If there's delalloc there, it means there are dirty pages for
+		 *    which writeback has not started yet (we clear the delalloc
+		 *    flag when starting writeback and after creating an ordered
+		 *    extent). If we mark pages in an adjacent range for defrag,
+		 *    then we will have a larger contiguous range for delalloc,
+		 *    very likely resulting in a larger extent after writeback is
+		 *    triggered (except in the case of free space fragmentation).
+		 */
+		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+				   EXTENT_DELALLOC, 0, NULL))
+			goto next;
+
 		/*
 		 * For do_compress case, we want to compress all valid file
 		 * extents, thus no @extent_thresh or mergeable check.
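The range_len computation added above matters when the defrag cursor lands partway into an extent map: only the tail from cur to the extent's end is a candidate. A standalone sketch of the arithmetic (plain userspace C, made-up sizes):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical extent map: starts at 1 MiB, 256 KiB long */
	uint64_t em_start = 1024 * 1024;
	uint64_t em_len = 256 * 1024;
	/* The defrag cursor sits 64 KiB into that extent */
	uint64_t cur = em_start + 64 * 1024;

	/* Only the part from cur to the end of the extent is a candidate */
	uint64_t range_len = em_len - (cur - em_start);

	/* Prints: range_len = 192 KiB */
	printf("range_len = %llu KiB\n",
	       (unsigned long long)(range_len / 1024));
	return 0;
}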
@@ -1221,7 +1250,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 			goto add;
 
 		/* Skip too large extent */
-		if (em->len >= extent_thresh)
+		if (range_len >= extent_thresh)
 			goto next;
 
 		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
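Switching the threshold test from em->len to range_len changes which extents get skipped: the remaining tail, not the whole extent, is measured. A toy comparison, assuming a 256 KiB threshold:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t extent_thresh = 256 * 1024;  /* assumed threshold */
	uint64_t em_start = 0, em_len = 600 * 1024; /* 600 KiB extent */
	uint64_t cur = 500 * 1024;                  /* cursor 500 KiB in */
	uint64_t range_len = em_len - (cur - em_start); /* 100 KiB tail */

	/* Old check: the whole-extent length made this extent look large */
	printf("em->len check:   %s\n",
	       em_len >= extent_thresh ? "skip" : "candidate");
	/* New check: only the remaining 100 KiB tail is measured */
	printf("range_len check: %s\n",
	       range_len >= extent_thresh ? "skip" : "candidate");
	return 0;
}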
@@ -1442,9 +1471,11 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 	list_for_each_entry(entry, &target_list, list) {
 		u32 range_len = entry->len;
 
-		/* Reached the limit */
-		if (max_sectors && max_sectors == *sectors_defragged)
+		/* Reached or beyond the limit */
+		if (max_sectors && *sectors_defragged >= max_sectors) {
+			ret = 1;
 			break;
+		}
 
 		if (max_sectors)
 			range_len = min_t(u32, range_len,
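The min_t() call is cut off by the hunk, but the surrounding code suggests it clamps range_len to the remaining sector budget. A minimal sketch of that kind of clamping, under the assumption of 4 KiB sectors (not the exact kernel expression):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;  /* assumed 4 KiB sectors */
	uint64_t max_sectors = 64;            /* caller's total budget */
	uint64_t sectors_defragged = 48;      /* consumed so far */
	uint64_t range_len = 256 * 1024;      /* candidate range, bytes */

	/* Clamp the byte range so the budget is never exceeded */
	uint64_t budget_bytes =
		(max_sectors - sectors_defragged) << sectorsize_bits;
	if (range_len > budget_bytes)
		range_len = budget_bytes;

	/* Prints: clamped range_len = 64 KiB */
	printf("clamped range_len = %llu KiB\n",
	       (unsigned long long)(range_len / 1024));
	return 0;
}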
@@ -1465,7 +1496,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 					       extent_thresh, newer_than, do_compress);
 		if (ret < 0)
 			break;
-		*sectors_defragged += range_len;
+		*sectors_defragged += range_len >>
+				      inode->root->fs_info->sectorsize_bits;
 	}
 out:
 	list_for_each_entry_safe(entry, tmp, &target_list, list) {
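This is the core of the accounting fix: sectors_defragged counts sectors, so the byte length must be shifted down by sectorsize_bits before being added. A worked example, assuming 4 KiB sectors:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;  /* log2(4096), assumed */
	uint64_t range_len = 128 * 1024;      /* 128 KiB defragged */

	/* Old accounting added bytes where sectors were expected */
	uint64_t as_bytes = range_len;                      /* 131072 */
	/* New accounting shifts bytes down to sectors */
	uint64_t as_sectors = range_len >> sectorsize_bits; /*     32 */

	printf("bytes=%llu sectors=%llu\n",
	       (unsigned long long)as_bytes, (unsigned long long)as_sectors);
	return 0;
}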
@@ -1484,6 +1516,12 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
  * @newer_than:    minimum transid to defrag
  * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
  *		   will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be
+ * updated to indicate the file offset where the next defrag should start.
+ * (Mostly for autodefrag, which sets @max_to_defrag, so we may exit early
+ * without defragging the whole range.)
  */
 int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		      struct btrfs_ioctl_defrag_range_args *range,
@@ -1499,6 +1537,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int ret = 0;
 	u32 extent_thresh = range->extent_thresh;
+	pgoff_t start_index;
 
 	if (isize == 0)
 		return 0;
@@ -1518,12 +1557,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 
 	if (range->start + range->len > range->start) {
 		/* Got a specific range */
-		last_byte = min(isize, range->start + range->len) - 1;
+		last_byte = min(isize, range->start + range->len);
 	} else {
 		/* Defrag until file end */
-		last_byte = isize - 1;
+		last_byte = isize;
 	}
 
+	/* Align the range */
+	cur = round_down(range->start, fs_info->sectorsize);
+	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
 	/*
 	 * If we were not given a ra, allocate a readahead context. As
 	 * readahead is just an optimization, defrag will work without it so
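round_down()/round_up() snap the user-supplied range to sector boundaries before the defrag loop runs. The same arithmetic in a self-contained sketch, using hand-rolled power-of-two macros and an assumed 4 KiB sector size:

#include <stdint.h>
#include <stdio.h>

#define ROUND_DOWN(x, align) ((x) & ~((uint64_t)(align) - 1))
#define ROUND_UP(x, align)   ROUND_DOWN((x) + (align) - 1, (align))

int main(void)
{
	const uint64_t sectorsize = 4096;  /* assumed */
	uint64_t start = 5000;             /* unaligned range start */
	uint64_t last_byte = 20000;        /* end before alignment */

	uint64_t cur = ROUND_DOWN(start, sectorsize);        /*  4096 */
	uint64_t last = ROUND_UP(last_byte, sectorsize) - 1; /* 20479 */

	printf("cur=%llu last_byte=%llu\n",
	       (unsigned long long)cur, (unsigned long long)last);
	return 0;
}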
@@ -1536,16 +1579,26 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		file_ra_state_init(ra, inode->i_mapping);
 	}
 
-	/* Align the range */
-	cur = round_down(range->start, fs_info->sectorsize);
-	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+	/*
+	 * Make writeback start from the beginning of the range, so that the
+	 * defrag range can be written sequentially.
+	 */
+	start_index = cur >> PAGE_SHIFT;
+	if (start_index < inode->i_mapping->writeback_index)
+		inode->i_mapping->writeback_index = start_index;
 
 	while (cur < last_byte) {
+		const unsigned long prev_sectors_defragged = sectors_defragged;
 		u64 cluster_end;
 
 		/* The cluster size 256K should always be page aligned */
 		BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
 
+		if (btrfs_defrag_cancelled(fs_info)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		/* We want the cluster end at page boundary when possible */
 		cluster_end = (((cur >> PAGE_SHIFT) +
 			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
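The cluster end computation advances 64 pages (256 KiB) from the page containing cur and makes the end inclusive. The expression in isolation, assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;   /* assumed 4 KiB pages */
	const uint64_t sz_256k = 256 * 1024;
	uint64_t cur = 300 * 1024;            /* somewhere mid-file */

	/* Same shape as the kernel expression: take the page holding cur,
	 * advance by 64 pages (256 KiB), convert back to bytes, end inclusive */
	uint64_t cluster_end = (((cur >> page_shift) +
				 (sz_256k >> page_shift)) << page_shift) - 1;

	printf("cluster_end=%llu\n", (unsigned long long)cluster_end);
	return 0;
}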
@@ -1567,14 +1620,27 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 					      cluster_end + 1 - cur, extent_thresh,
 					      newer_than, do_compress,
 					      &sectors_defragged, max_to_defrag);
+
+		if (sectors_defragged > prev_sectors_defragged)
+			balance_dirty_pages_ratelimited(inode->i_mapping);
+
 		btrfs_inode_unlock(inode, 0);
 		if (ret < 0)
 			break;
 		cur = cluster_end + 1;
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
 	}
 
 	if (ra_allocated)
 		kfree(ra);
+	/*
+	 * Update range->start for autodefrag; this indicates where the next
+	 * run should start.
+	 */
+	range->start = cur;
 	if (sectors_defragged) {
 		/*
 		 * We have defragged some sectors, for compression case they
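Given the return contract documented above, a caller in the autodefrag style can resume from range->start on the next pass. A hypothetical userspace sketch of that pattern; defrag_file() here is a stub that only mimics the documented behavior, not the kernel function:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the ioctl args; not the kernel struct layout */
struct defrag_range {
	uint64_t start;
	uint64_t len;
};

/* Hypothetical stub: pretend each call defrags up to max_sectors and
 * advances range->start, mirroring the documented contract */
static int defrag_file(struct defrag_range *range, uint64_t isize,
		       unsigned long max_sectors)
{
	uint64_t done = (uint64_t)max_sectors << 12;  /* 4 KiB sectors */

	if (range->start + done > isize)
		done = isize - range->start;
	range->start += done;
	return (int)(done >> 12);  /* >=0: sectors defragged */
}

int main(void)
{
	struct defrag_range range = { .start = 0, .len = 1024 * 1024 };
	uint64_t isize = 1024 * 1024;

	/* Resume loop: keep going from range.start until the file ends */
	while (range.start < isize) {
		int ret = defrag_file(&range, isize, 64 /* sector budget */);
		if (ret < 0)
			break;  /* error */
		printf("defragged %d sectors, next start at %llu\n",
		       ret, (unsigned long long)range.start);
	}
	return 0;
}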