Skip to content

Commit 43683bb

Browse files
puranjaymohan authored and gregkh committed
bpf: fix mm lifecycle in open-coded task_vma iterator
[ Upstream commit d8e27d2 ]

The open-coded task_vma iterator reads task->mm locklessly and acquires mmap_read_trylock() but never calls mmget(). If the task exits concurrently, the mm_struct can be freed as it is not SLAB_TYPESAFE_BY_RCU, resulting in a use-after-free.

Safely read task->mm with a trylock on alloc_lock and acquire an mm reference. Drop the reference via bpf_iter_mmput_async() in _destroy() and error paths. bpf_iter_mmput_async() is a local wrapper around mmput_async() with a fallback to mmput() on !CONFIG_MMU.

Reject irqs-disabled contexts (including NMI) up front. Operations used by _next() and _destroy() (mmap_read_unlock, bpf_iter_mmput_async) take spinlocks with IRQs disabled (pool->lock, pi_lock). Running from NMI or from a tracepoint that fires with those locks held could deadlock.

A trylock on alloc_lock is used instead of the blocking task_lock() (get_task_mm) to avoid a deadlock when a softirq BPF program iterates a task that already holds its alloc_lock on the same CPU.

Fixes: 4ac4546 ("bpf: Introduce task_vma open-coded iterator kfuncs")
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260408154539.3832150-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 4e70ba1 commit 43683bb

1 file changed

Lines changed: 51 additions & 3 deletions

File tree

kernel/bpf/task_iter.c

Lines changed: 51 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
#include <linux/bpf_mem_alloc.h>
1010
#include <linux/btf_ids.h>
1111
#include <linux/mm_types.h>
12+
#include <linux/sched/mm.h>
1213
#include "mmap_unlock_work.h"
1314

1415
static const char * const iter_task_type_names[] = {
@@ -794,6 +795,15 @@ const struct bpf_func_proto bpf_find_vma_proto = {
794795
.arg5_type = ARG_ANYTHING,
795796
};
796797

798+
static inline void bpf_iter_mmput_async(struct mm_struct *mm)
799+
{
800+
#ifdef CONFIG_MMU
801+
mmput_async(mm);
802+
#else
803+
mmput(mm);
804+
#endif
805+
}
806+
797807
struct bpf_iter_task_vma_kern_data {
798808
struct task_struct *task;
799809
struct mm_struct *mm;
@@ -825,6 +835,24 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
825835
BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
826836
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
827837

838+
/* bpf_iter_mmput_async() needs mmput_async() which requires CONFIG_MMU */
839+
if (!IS_ENABLED(CONFIG_MMU)) {
840+
kit->data = NULL;
841+
return -EOPNOTSUPP;
842+
}
843+
844+
/*
845+
* Reject irqs-disabled contexts including NMI. Operations used
846+
* by _next() and _destroy() (mmap_read_unlock, bpf_iter_mmput_async)
847+
* can take spinlocks with IRQs disabled (pi_lock, pool->lock).
848+
* Running from NMI or from a tracepoint that fires with those
849+
* locks held could deadlock.
850+
*/
851+
if (irqs_disabled()) {
852+
kit->data = NULL;
853+
return -EBUSY;
854+
}
855+
828856
/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
829857
* before, so non-NULL kit->data doesn't point to previously
830858
* bpf_mem_alloc'd bpf_iter_task_vma_kern_data
@@ -834,7 +862,25 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
834862
return -ENOMEM;
835863

836864
kit->data->task = get_task_struct(task);
865+
/*
866+
* Safely read task->mm and acquire an mm reference.
867+
*
868+
* Cannot use get_task_mm() because its task_lock() is a
869+
* blocking spin_lock that would deadlock if the target task
870+
* already holds alloc_lock on this CPU (e.g. a softirq BPF
871+
* program iterating a task interrupted while holding its
872+
* alloc_lock).
873+
*/
874+
if (!spin_trylock(&task->alloc_lock)) {
875+
err = -EBUSY;
876+
goto err_cleanup_iter;
877+
}
837878
kit->data->mm = task->mm;
879+
if (kit->data->mm && !(task->flags & PF_KTHREAD))
880+
mmget(kit->data->mm);
881+
else
882+
kit->data->mm = NULL;
883+
spin_unlock(&task->alloc_lock);
838884
if (!kit->data->mm) {
839885
err = -ENOENT;
840886
goto err_cleanup_iter;
@@ -844,15 +890,16 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
844890
irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
845891
if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
846892
err = -EBUSY;
847-
goto err_cleanup_iter;
893+
goto err_cleanup_mmget;
848894
}
849895

850896
vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
851897
return 0;
852898

899+
err_cleanup_mmget:
900+
bpf_iter_mmput_async(kit->data->mm);
853901
err_cleanup_iter:
854-
if (kit->data->task)
855-
put_task_struct(kit->data->task);
902+
put_task_struct(kit->data->task);
856903
bpf_mem_free(&bpf_global_ma, kit->data);
857904
/* NULL kit->data signals failed bpf_iter_task_vma initialization */
858905
kit->data = NULL;
@@ -875,6 +922,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
875922
if (kit->data) {
876923
bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
877924
put_task_struct(kit->data->task);
925+
bpf_iter_mmput_async(kit->data->mm);
878926
bpf_mem_free(&bpf_global_ma, kit->data);
879927
}
880928
}

0 commit comments

Comments
 (0)