Skip to content

Commit 3745834

Browse files
puranjaymohan authored and gregkh committed
bpf: switch task_vma iterator from mmap_lock to per-VMA locks
[ Upstream commit bee9ef4 ]

The open-coded task_vma iterator holds mmap_lock for the entire duration of iteration, increasing contention on this highly contended lock.

Switch to per-VMA locking. Find the next VMA via an RCU-protected maple tree walk and lock it with lock_vma_under_rcu(). lock_next_vma() is not used because its fallback takes mmap_read_lock(), and the iterator must work in non-sleepable contexts.

lock_vma_under_rcu() is a point lookup (mas_walk) that finds the VMA containing a given address but cannot iterate across gaps. An RCU-protected vma_next() walk (mas_find) first locates the next VMA's vm_start to pass to lock_vma_under_rcu().

Between the RCU walk and the lock, the VMA may be removed, shrunk, or write-locked. On failure, advance past it using vm_end from the RCU walk. Because the VMA slab is SLAB_TYPESAFE_BY_RCU, vm_end may be stale; fall back to PAGE_SIZE advancement when it does not make forward progress. Concurrent VMA insertions at addresses already passed by the iterator are not detected.

CONFIG_PER_VMA_LOCK is required; return -EOPNOTSUPP without it.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260408154539.3832150-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Stable-dep-of: 4cbee02 ("bpf: return VMA snapshot from task_vma iterator")
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent d0862de commit 3745834

1 file changed

Lines changed: 73 additions & 18 deletions

File tree

kernel/bpf/task_iter.c

Lines changed: 73 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
#include <linux/bpf_mem_alloc.h>
1010
#include <linux/btf_ids.h>
1111
#include <linux/mm_types.h>
12+
#include <linux/mmap_lock.h>
1213
#include <linux/sched/mm.h>
1314
#include "mmap_unlock_work.h"
1415

@@ -807,8 +808,8 @@ static inline void bpf_iter_mmput_async(struct mm_struct *mm)
807808
/*
 * Kernel-side state behind an open-coded task_vma iterator.
 *
 * This is a diff hunk: rows whose gutter ends in "-" (work, vmi) are the
 * members this commit removes, rows whose gutter ends in "+" (locked_vma,
 * next_addr) are the members it adds.
 */
struct bpf_iter_task_vma_kern_data {
808809
struct task_struct *task; /* task reference; dropped with put_task_struct() */
809810
struct mm_struct *mm; /* mm being walked; released via bpf_iter_mmput_async() */
810-
struct mmap_unlock_irq_work *work; /* removed: was used for the mmap_lock unlock path */
811-
struct vma_iterator vmi; /* removed: iterator state now lives on the stack of the lookup helper */
811+
struct vm_area_struct *locked_vma; /* VMA currently read-locked for the caller, or NULL */
812+
u64 next_addr; /* address from which the next VMA lookup starts */
812813
};
813814

814815
struct bpf_iter_task_vma {
@@ -829,21 +830,19 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
829830
struct task_struct *task, u64 addr)
830831
{
831832
struct bpf_iter_task_vma_kern *kit = (void *)it;
832-
bool irq_work_busy = false;
833833
int err;
834834

835835
BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
836836
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
837837

838-
/* bpf_iter_mmput_async() needs mmput_async() which requires CONFIG_MMU */
839-
if (!IS_ENABLED(CONFIG_MMU)) {
838+
if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
840839
kit->data = NULL;
841840
return -EOPNOTSUPP;
842841
}
843842

844843
/*
845844
* Reject irqs-disabled contexts including NMI. Operations used
846-
* by _next() and _destroy() (mmap_read_unlock, bpf_iter_mmput_async)
845+
* by _next() and _destroy() (vma_end_read, bpf_iter_mmput_async)
847846
* can take spinlocks with IRQs disabled (pi_lock, pool->lock).
848847
* Running from NMI or from a tracepoint that fires with those
849848
* locks held could deadlock.
@@ -886,18 +885,10 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
886885
goto err_cleanup_iter;
887886
}
888887

889-
/* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
890-
irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
891-
if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
892-
err = -EBUSY;
893-
goto err_cleanup_mmget;
894-
}
895-
896-
vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
888+
kit->data->locked_vma = NULL;
889+
kit->data->next_addr = addr;
897890
return 0;
898891

899-
err_cleanup_mmget:
900-
bpf_iter_mmput_async(kit->data->mm);
901892
err_cleanup_iter:
902893
put_task_struct(kit->data->task);
903894
bpf_mem_free(&bpf_global_ma, kit->data);
@@ -906,21 +897,85 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
906897
return err;
907898
}
908899

900+
/*
901+
* Find and lock the next VMA at or after data->next_addr.
902+
*
903+
* lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA
904+
* containing a given address but cannot iterate. An RCU-protected
905+
* maple tree walk with vma_next() (mas_find) is needed first to locate
906+
* the next VMA's vm_start across any gap.
907+
*
908+
* Between the RCU walk and the lock, the VMA may be removed, shrunk,
909+
* or write-locked. On failure, advance past it using vm_end from the
910+
* RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back
911+
* to PAGE_SIZE advancement to guarantee forward progress.
912+
*/
913+
static struct vm_area_struct *
914+
bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
915+
{
916+
struct vm_area_struct *vma;
917+
struct vma_iterator vmi;
918+
unsigned long start, end;
919+
920+
retry:
921+
rcu_read_lock();
922+
vma_iter_init(&vmi, data->mm, data->next_addr);
923+
vma = vma_next(&vmi);
924+
if (!vma) {
925+
rcu_read_unlock();
926+
return NULL;
927+
}
928+
start = vma->vm_start;
929+
end = vma->vm_end;
930+
rcu_read_unlock();
931+
932+
vma = lock_vma_under_rcu(data->mm, start);
933+
if (!vma) {
934+
if (end <= data->next_addr)
935+
data->next_addr += PAGE_SIZE;
936+
else
937+
data->next_addr = end;
938+
goto retry;
939+
}
940+
941+
if (unlikely(vma->vm_end <= data->next_addr)) {
942+
data->next_addr += PAGE_SIZE;
943+
vma_end_read(vma);
944+
goto retry;
945+
}
946+
947+
return vma;
948+
}
949+
909950
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
910951
{
911952
struct bpf_iter_task_vma_kern *kit = (void *)it;
953+
struct vm_area_struct *vma;
912954

913955
if (!kit->data) /* bpf_iter_task_vma_new failed */
914956
return NULL;
915-
return vma_next(&kit->data->vmi);
957+
958+
if (kit->data->locked_vma) {
959+
vma_end_read(kit->data->locked_vma);
960+
kit->data->locked_vma = NULL;
961+
}
962+
963+
vma = bpf_iter_task_vma_find_next(kit->data);
964+
if (!vma)
965+
return NULL;
966+
967+
kit->data->locked_vma = vma;
968+
kit->data->next_addr = vma->vm_end;
969+
return vma;
916970
}
917971

918972
__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
919973
{
920974
struct bpf_iter_task_vma_kern *kit = (void *)it;
921975

922976
if (kit->data) {
923-
bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
977+
if (kit->data->locked_vma)
978+
vma_end_read(kit->data->locked_vma);
924979
put_task_struct(kit->data->task);
925980
bpf_iter_mmput_async(kit->data->mm);
926981
bpf_mem_free(&bpf_global_ma, kit->data);

0 commit comments

Comments
 (0)