@@ -99,6 +99,7 @@ struct bpf_mem_cache {
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
 	bool draining;
+	struct bpf_mem_cache *tgt;

 	/* list of objects to be freed after RCU tasks trace GP */
 	struct llist_head free_by_rcu_ttrace;
@@ -199,18 +200,11 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)

 	for (i = 0; i < cnt; i++) {
 		/*
-		 * free_by_rcu_ttrace is only manipulated by irq work refill_work().
-		 * IRQ works on the same CPU are called sequentially, so it is
-		 * safe to use __llist_del_first() here. If alloc_bulk() is
-		 * invoked by the initial prefill, there will be no running
-		 * refill_work(), so __llist_del_first() is fine as well.
-		 *
-		 * In most cases, objects on free_by_rcu_ttrace are from the same CPU.
-		 * If some objects come from other CPUs, it doesn't incur any
-		 * harm because NUMA_NO_NODE means the preference for current
-		 * numa node and it is not a guarantee.
+		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+		 * done only by one CPU == current CPU. Other CPUs might
+		 * llist_add() and llist_del_all() in parallel.
 		 */
-		obj = __llist_del_first(&c->free_by_rcu_ttrace);
+		obj = llist_del_first(&c->free_by_rcu_ttrace);
 		if (!obj)
 			break;
 		add_obj_to_free_list(c, obj);
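
The rewritten comment above is the key invariant of this change: for a given cache, only the owning CPU pops single entries off free_by_rcu_ttrace, while other CPUs may push entries or detach the whole list concurrently, which is why the lockless llist_* calls replace the __llist_* ones. Below is a minimal userspace sketch of that contract; the names are hypothetical and C11 atomics stand in for the kernel's cmpxchg/xchg, so this is not the <linux/llist.h> implementation.

#include <stdatomic.h>
#include <stdio.h>

struct lnode { struct lnode *next; };
struct lhead { _Atomic(struct lnode *) first; };

/* Push one node; safe from any number of concurrent callers. */
static void ll_add(struct lnode *n, struct lhead *h)
{
	struct lnode *first = atomic_load(&h->first);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&h->first, &first, n));
}

/* Pop one node; only a single consumer at a time may call this
 * (the "only current CPU" rule from the hunk above).
 */
static struct lnode *ll_del_first(struct lhead *h)
{
	struct lnode *entry = atomic_load(&h->first), *next;

	do {
		if (!entry)
			return NULL;
		next = entry->next;
	} while (!atomic_compare_exchange_weak(&h->first, &entry, next));
	return entry;
}

/* Detach the whole list in one exchange; safe from any context. */
static struct lnode *ll_del_all(struct lhead *h)
{
	return atomic_exchange(&h->first, NULL);
}

int main(void)
{
	struct lhead head = { NULL };
	struct lnode a, b;

	ll_add(&a, &head);                                   /* any CPU    */
	ll_add(&b, &head);                                   /* any CPU    */
	printf("popped %p\n", (void *)ll_del_first(&head));  /* owner only */
	printf("drained %p\n", (void *)ll_del_all(&head));   /* any CPU    */
	return 0;
}

The single-consumer rule on ll_del_first() exists because two concurrent poppers can race on a stale next pointer (the classic ABA hazard), whereas ll_add() and ll_del_all() never dereference a node they do not own.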
@@ -284,18 +278,23 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj)
 	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
 	 * Nothing races to add to free_by_rcu_ttrace list.
 	 */
-	__llist_add(llnode, &c->free_by_rcu_ttrace);
+	llist_add(llnode, &c->free_by_rcu_ttrace);
 }

 static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
 {
 	struct llist_node *llnode, *t;

-	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1))
+	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+		if (unlikely(READ_ONCE(c->draining))) {
+			llnode = llist_del_all(&c->free_by_rcu_ttrace);
+			free_all(llnode, !!c->percpu_size);
+		}
 		return;
+	}

 	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
-	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu_ttrace))
+	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
 		/* There is no concurrent __llist_add(waiting_for_gp_ttrace) access.
 		 * It doesn't race with llist_del_all either.
 		 * But there could be two concurrent llist_del_all(waiting_for_gp_ttrace):
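
The added branch in do_call_rcu_ttrace() is worth spelling out: atomic_xchg() acts as an "RCU callback already in flight" latch, and when the latch is already set while the cache is draining, the objects parked on free_by_rcu_ttrace are freed immediately rather than left queued behind that callback. Below is a compact userspace sketch of just this control flow, with hypothetical names and stub helpers in place of llist_del_all()/free_all() and the RCU callback.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct cache {
	atomic_int rcu_in_progress;   /* call_rcu_ttrace_in_progress   */
	atomic_bool draining;         /* set once teardown has started */
};

/* Stub for free_all(llist_del_all(&c->free_by_rcu_ttrace), ...). */
static void free_pending_now(void)
{
	printf("draining: freeing pending objects immediately\n");
}

/* Stub for queueing the RCU tasks trace callback. */
static void queue_rcu_callback(void)
{
	printf("queued RCU tasks trace callback\n");
}

static void do_call_rcu_sketch(struct cache *c)
{
	if (atomic_exchange(&c->rcu_in_progress, 1)) {
		/* A callback is already in flight; normally just back off,
		 * but the draining path frees the pending list right away.
		 */
		if (atomic_load(&c->draining))
			free_pending_now();
		return;
	}
	queue_rcu_callback();
}

int main(void)
{
	struct cache c = { 0 };

	do_call_rcu_sketch(&c);            /* queues the callback          */
	atomic_store(&c.draining, true);
	do_call_rcu_sketch(&c);            /* in flight + draining: free   */
	return 0;
}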
@@ -318,10 +317,13 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)

 static void free_bulk(struct bpf_mem_cache *c)
 {
+	struct bpf_mem_cache *tgt = c->tgt;
 	struct llist_node *llnode, *t;
 	unsigned long flags;
 	int cnt;

+	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+
 	do {
 		inc_active(c, &flags);
 		llnode = __llist_del_first(&c->free_llist);
@@ -331,13 +333,13 @@ static void free_bulk(struct bpf_mem_cache *c)
 			cnt = 0;
 		dec_active(c, flags);
 		if (llnode)
-			enque_to_free(c, llnode);
+			enque_to_free(tgt, llnode);
 	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

 	/* and drain free_llist_extra */
 	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
-		enque_to_free(c, llnode);
-	do_call_rcu_ttrace(c);
+		enque_to_free(tgt, llnode);
+	do_call_rcu_ttrace(tgt);
 }

 static void bpf_mem_refill(struct irq_work *work)
@@ -436,6 +438,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 		c->unit_size = unit_size;
 		c->objcg = objcg;
 		c->percpu_size = percpu_size;
+		c->tgt = c;
 		prefill_mem_cache(c, cpu);
 	}
 	ma->cache = pc;
@@ -458,6 +461,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
+			c->tgt = c;
 			prefill_mem_cache(c, cpu);
 		}
 	}
@@ -476,7 +480,7 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
 	 * on these lists, so it is safe to use __llist_del_all().
 	 */
-	free_all(__llist_del_all(&c->free_by_rcu_ttrace), percpu);
+	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
 	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
 	free_all(__llist_del_all(&c->free_llist), percpu);
 	free_all(__llist_del_all(&c->free_llist_extra), percpu);
@@ -601,8 +605,10 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
 		llnode = __llist_del_first(&c->free_llist);
-		if (llnode)
+		if (llnode) {
 			cnt = --c->free_cnt;
+			*(struct bpf_mem_cache **)llnode = c;
+		}
 	}
 	local_dec(&c->active);
 	local_irq_restore(flags);
@@ -626,6 +632,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)

 	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

+	/*
+	 * Remember bpf_mem_cache that allocated this object.
+	 * The hint is not accurate.
+	 */
+	c->tgt = *(struct bpf_mem_cache **)llnode;
+
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
 		__llist_add(llnode, &c->free_llist);
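
Taken together, the unit_alloc() and unit_free() hunks implement the hint: while an object is handed out, its llist_node header is not needed for list linkage, so unit_alloc() stores the owning bpf_mem_cache in that first pointer-sized word and unit_free() copies it back into c->tgt, which free_bulk() above then passes to enque_to_free() and do_call_rcu_ttrace(). The comment calls the hint "not accurate", presumably because that word sits inside memory the object's user controls in the meantime. A minimal single-threaded sketch of the trick, with hypothetical names:

#include <stdio.h>

struct llnode { struct llnode *next; };   /* stand-in for struct llist_node */

struct mem_cache {
	int unit_size;
	struct mem_cache *tgt;
};

/* unit_alloc() side: the header is idle while the object is out,
 * so borrow its first pointer-sized word to remember the owner.
 */
static void *alloc_sketch(struct mem_cache *c, void *obj)
{
	*(struct mem_cache **)obj = c;
	return obj;
}

/* unit_free() side: read the hint back and record it as the target
 * cache; free_bulk() would then enqueue the object toward tgt.
 */
static void free_sketch(struct mem_cache *c, void *obj)
{
	c->tgt = *(struct mem_cache **)obj;
}

int main(void)
{
	struct mem_cache owner = { .unit_size = 96 };
	struct mem_cache freer = { .unit_size = 96 };
	struct llnode obj;

	alloc_sketch(&owner, &obj);
	free_sketch(&freer, &obj);
	printf("hint points at cache with unit_size=%d\n", freer.tgt->unit_size);
	return 0;
}

Here freer.tgt ends up pointing at owner, mirroring how unit_free() lets free_bulk() route an object back toward the cache that handed it out.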