@@ -1028,7 +1028,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 	return ret;
 }
 
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+	long nr_to_scan;
+	long scanned;
+	u64 last_ino;
+	u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
 	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
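
The new btrfs_em_shrink_ctx bundles the per-run scan budget and the resume cursors that the old signature threaded through as an out-parameter plus fields read straight from fs_info. A minimal usage sketch, assuming a hypothetical caller and budget value (only the struct and btrfs_scan_inode() come from the patch):

static void example_shrink_pass(struct btrfs_inode *inode)
{
	struct btrfs_em_shrink_ctx ctx = {
		.nr_to_scan = 32,	/* hypothetical per-run budget */
	};
	long dropped;

	/* All scan state lives on this task's stack, so concurrent
	 * shrinker runs never race on a shared cursor while scanning. */
	dropped = btrfs_scan_inode(inode, &ctx);

	/* ctx.scanned and ctx.last_ino now describe this run's progress. */
	(void)dropped;
}
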
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
 	if (!down_read_trylock(&inode->i_mmap_lock))
 		return 0;
 
-	write_lock(&tree->lock);
+	/*
+	 * We want to be fast because we can be called from any path trying to
+	 * allocate memory, so if the lock is busy we don't want to spend time
+	 * waiting for it - either some task is about to do IO for the inode or
+	 * we may have another task shrinking extent maps, here in this code, so
+	 * skip this inode.
+	 */
+	if (!write_trylock(&tree->lock)) {
+		up_read(&inode->i_mmap_lock);
+		return 0;
+	}
+
 	node = rb_first_cached(&tree->map);
 	while (node) {
 		struct extent_map *em;
 
 		em = rb_entry(node, struct extent_map, rb_node);
 		node = rb_next(node);
-		(*scanned)++;
+		ctx->scanned++;
 
 		if (em->flags & EXTENT_FLAG_PINNED)
 			goto next;
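
A self-contained userspace analogue of the trylock-and-skip pattern above, with pthread locks standing in for i_mmap_lock and the extent tree lock (all names here are illustrative, not kernel APIs): the reclaim pass never blocks on a busy object, and it unwinds the lock it already holds before skipping.

#include <pthread.h>

struct object {
	pthread_rwlock_t outer;	/* plays the role of i_mmap_lock */
	pthread_rwlock_t inner;	/* plays the role of extent_tree.lock */
};

/* Both locks are assumed initialized with pthread_rwlock_init(). */
static long try_reclaim(struct object *obj)
{
	long dropped = 0;

	if (pthread_rwlock_tryrdlock(&obj->outer) != 0)
		return 0;		/* object busy: skip, don't wait */

	if (pthread_rwlock_trywrlock(&obj->inner) != 0) {
		/* Unwind in reverse order before giving up. */
		pthread_rwlock_unlock(&obj->outer);
		return 0;
	}

	/* ... drop reclaimable entries, counting them in 'dropped' ... */

	pthread_rwlock_unlock(&obj->inner);
	pthread_rwlock_unlock(&obj->outer);
	return dropped;
}
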
@@ -1085,42 +1103,49 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
 		free_extent_map(em);
 		nr_dropped++;
 next:
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
 			break;
 
 		/*
-		 * Restart if we had to reschedule, and any extent maps that were
-		 * pinned before may have become unpinned after we released the
-		 * lock and took it again.
+		 * Stop if we need to reschedule or there's contention on the
+		 * lock. This is to avoid slowing other tasks trying to take the
+		 * lock and because the shrinker might be called during a memory
+		 * allocation path and we want to avoid taking a very long time
+		 * and slowing down all sorts of tasks.
 		 */
-		if (cond_resched_rwlock_write(&tree->lock))
-			node = rb_first_cached(&tree->map);
+		if (need_resched() || rwlock_needbreak(&tree->lock))
+			break;
 	}
 	write_unlock(&tree->lock);
 	up_read(&inode->i_mmap_lock);
 
 	return nr_dropped;
 }
 
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_inode *inode;
 	long nr_dropped = 0;
-	u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+	u64 min_ino = ctx->last_ino + 1;
 
 	inode = btrfs_find_first_inode(root, min_ino);
 	while (inode) {
-		nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+		nr_dropped += btrfs_scan_inode(inode, ctx);
 
 		min_ino = btrfs_ino(inode) + 1;
-		fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
-		iput(&inode->vfs_inode);
+		ctx->last_ino = btrfs_ino(inode);
+		btrfs_add_delayed_iput(inode);
 
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
+			break;
+
+		/*
+		 * We may be called from memory allocation paths, so we don't
+		 * want to take too much time and slow down tasks.
+		 */
+		if (need_resched())
 			break;
 
-		cond_resched();
 		inode = btrfs_find_first_inode(root, min_ino);
 	}
 
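
The loop shape both functions now share — spend the budget, but return the moment someone needs the CPU or the lock, keeping a cursor so a later run resumes where this one stopped — can be sketched generically (the names and the should_yield callback are assumptions, not kernel interfaces):

struct resume_cursor {
	unsigned long long next;	/* first key not yet scanned */
};

static long bounded_scan(struct resume_cursor *cur, long budget,
			 int (*should_yield)(void))
{
	long done = 0;

	while (done < budget) {
		/* ... process the item at cur->next ... */
		cur->next++;
		done++;

		/* Bail out instead of sleeping mid-scan; the caller can
		 * simply call again later with the same cursor. */
		if (should_yield())
			break;
	}
	return done;
}
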
@@ -1132,34 +1157,56 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
 		 * inode if there is one or we will find out this was the last
 		 * one and move to the next root.
 		 */
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+		ctx->last_root = btrfs_root_id(root);
 	} else {
 		/*
 		 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
 		 * that when processing the next root we start from its first inode.
 		 */
-		fs_info->extent_map_shrinker_last_ino = 0;
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+		ctx->last_ino = 0;
+		ctx->last_root = btrfs_root_id(root) + 1;
 	}
 
 	return nr_dropped;
 }
 
 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 {
-	const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
-	u64 next_root_id = start_root_id;
+	struct btrfs_em_shrink_ctx ctx;
+	u64 start_root_id;
+	u64 next_root_id;
 	bool cycled = false;
 	long nr_dropped = 0;
-	long scanned = 0;
+
+	ctx.scanned = 0;
+	ctx.nr_to_scan = nr_to_scan;
+
+	/*
+	 * In case we have multiple tasks running this shrinker, make the next
+	 * one start from the next inode in case it starts before we finish.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+	fs_info->extent_map_shrinker_last_ino++;
+	ctx.last_root = fs_info->extent_map_shrinker_last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+	start_root_id = ctx.last_root;
+	next_root_id = ctx.last_root;
 
 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
							   nr, ctx.last_root,
							   ctx.last_ino);
 	}
 
-	while (scanned < nr_to_scan) {
+	/*
+	 * We may be called from memory allocation paths, so we don't want to
+	 * take too much time and slow down tasks; stop if we need to reschedule.
+	 */
+	while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
 		struct btrfs_root *root;
 		unsigned long count;
 
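
The spin-locked block at the top of btrfs_free_extent_maps() is a read-and-advance handoff: each run snapshots the shared cursor and bumps it by one inode, so a second task entering concurrently starts elsewhere. A userspace sketch of the same idea, assuming illustrative names and a pthread mutex in place of the spinlock:

#include <pthread.h>

static pthread_mutex_t cursor_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long shared_last_ino;
static unsigned long long shared_last_root;

static void snapshot_cursor(unsigned long long *ino, unsigned long long *root)
{
	pthread_mutex_lock(&cursor_lock);
	*ino = shared_last_ino;
	shared_last_ino++;	/* a concurrent run starts one inode later */
	*root = shared_last_root;
	pthread_mutex_unlock(&cursor_lock);
}
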
@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			spin_unlock(&fs_info->fs_roots_radix_lock);
 			if (start_root_id > 0 && !cycled) {
 				next_root_id = 0;
-				fs_info->extent_map_shrinker_last_root = 0;
-				fs_info->extent_map_shrinker_last_ino = 0;
+				ctx.last_root = 0;
+				ctx.last_ino = 0;
 				cycled = true;
 				continue;
 			}
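
For context, the surrounding start_root_id/cycled logic implements a single wrap-around: a scan that starts partway through the root-id space restarts once from zero when it runs out of roots, so one call can still cover every root. A generic sketch, assuming a hypothetical find callback:

/* Returns nonzero if an entry at or after *key exists, updating *key. */
typedef int (*find_first_fn)(unsigned long long *key);

static void scan_with_single_wrap(unsigned long long start, find_first_fn find)
{
	unsigned long long next = start;
	int cycled = 0;

	for (;;) {
		if (!find(&next)) {
			/* End of key space: wrap to 0 once if we started late. */
			if (start > 0 && !cycled) {
				next = 0;
				cycled = 1;
				continue;
			}
			break;	/* a full pass has been made */
		}
		/* ... process the entry, then advance past it ... */
		next++;
	}
}
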
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			continue;
 
 		if (is_fstree(btrfs_root_id(root)))
-			nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+			nr_dropped += btrfs_scan_root(root, &ctx);
 
 		btrfs_put_root(root);
 	}
 
+	/*
+	 * In case of multiple tasks running this extent map shrinking code, this
+	 * isn't perfect but it's simple and silences things like KCSAN. It's
+	 * not possible to know which task made more progress because we can
+	 * cycle back to the first root and first inode if it's not the first
+	 * time the shrinker ran, see the above logic. Also a task that started
+	 * later may finish earlier than another task and make less progress. So
+	 * make this simple and update to the progress of the last task that
+	 * finished, with the occasional possibility of having two consecutive
+	 * runs of the shrinker process the same inodes.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+	fs_info->extent_map_shrinker_last_root = ctx.last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
 	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
							  nr, ctx.last_root,
							  ctx.last_ino);
 	}
 
 	return nr_dropped;
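
And the write-back half of the cursor handoff: progress is published under the same lock, last writer wins, trading a possible re-scan of a few inodes for a simple scheme that keeps KCSAN quiet. Continuing the userspace sketch from above (illustrative names, same cursor_lock and shared cursor variables):

static void publish_cursor(unsigned long long ino, unsigned long long root)
{
	pthread_mutex_lock(&cursor_lock);
	/* Last finisher wins; overlap only means the next run may
	 * re-scan a few inodes, which is harmless for a shrinker. */
	shared_last_ino = ino;
	shared_last_root = root;
	pthread_mutex_unlock(&cursor_lock);
}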