
Commit 96d81e4

lorenzo-stoakes authored and akpm00 committed
mm/pagewalk: split walk_page_range_novma() into kernel/user parts
walk_page_range_novma() is rather confusing - it supports two modes, one used often, the other used only for debugging. The first mode is the common case of traversal of kernel page tables, which is what nearly all callers use this for. Secondly it provides an unusual debugging interface that allows for the traversal of page tables in a userland range of memory even for that memory which is not described by a VMA. It is far from certain that such page tables should even exist, but perhaps this is precisely why it is useful as a debugging mechanism. As a result, this is utilised by ptdump only. Historically, things were reversed - ptdump was the only user, and other parts of the kernel evolved to use the kernel page table walking here. Since we have some complicated and confusing locking rules for the novma case, it makes sense to separate the two usages into their own functions. Doing this also provide self-documentation as to the intent of the caller - are they doing something rather unusual or are they simply doing a standard kernel page table walk? We therefore establish two separate functions - walk_page_range_debug() for this single usage, and walk_kernel_page_table_range() for general kernel page table walking. The walk_page_range_debug() function is currently used to traverse both userland and kernel mappings, so we maintain this and in the case of kernel mappings being traversed, we have walk_page_range_debug() invoke walk_kernel_page_table_range() internally. We additionally make walk_page_range_debug() internal to mm. Link: https://lkml.kernel.org/r/20250605135104.90720-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Acked-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Reviewed-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: David Hildenbrand <david@redhat.com> Cc: Albert Ou <aou@eecs.berkeley.edu> Cc: Alexandre Ghiti <alex@ghiti.fr> Cc: Barry Song <baohua@kernel.org> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Jann Horn <jannh@google.com> Cc: Jonas Bonn <jonas@southpole.se> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Paul Walmsley <paul.walmsley@sifive.com> Cc: Stafford Horne <shorne@gmail.com> Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi> Cc: WANG Xuerui <kernel@xen0n.name> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 03dfefd · commit 96d81e4

8 files changed: +71, -35 lines


arch/loongarch/mm/pageattr.c

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, pgp
         return 0;
 
         mmap_write_lock(&init_mm);
-        ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, &masks);
+        ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks);
         mmap_write_unlock(&init_mm);
 
         flush_tlb_kernel_range(start, end);

arch/openrisc/kernel/dma.c

Lines changed: 2 additions & 2 deletions
@@ -72,7 +72,7 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size)
          * them and setting the cache-inhibit bit.
          */
         mmap_write_lock(&init_mm);
-        error = walk_page_range_novma(&init_mm, va, va + size,
+        error = walk_kernel_page_table_range(va, va + size,
                         &set_nocache_walk_ops, NULL, NULL);
         mmap_write_unlock(&init_mm);
 
@@ -87,7 +87,7 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size)
 
         mmap_write_lock(&init_mm);
         /* walk_page_range shouldn't be able to fail here */
-        WARN_ON(walk_page_range_novma(&init_mm, va, va + size,
+        WARN_ON(walk_kernel_page_table_range(va, va + size,
                 &clear_nocache_walk_ops, NULL, NULL));
         mmap_write_unlock(&init_mm);
 }

arch/riscv/mm/pageattr.c

Lines changed: 4 additions & 4 deletions
@@ -299,7 +299,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
         if (ret)
                 goto unlock;
 
-        ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+        ret = walk_kernel_page_table_range(lm_start, lm_end,
                                     &pageattr_ops, NULL, &masks);
         if (ret)
                 goto unlock;
@@ -317,13 +317,13 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
         if (ret)
                 goto unlock;
 
-        ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+        ret = walk_kernel_page_table_range(lm_start, lm_end,
                                     &pageattr_ops, NULL, &masks);
         if (ret)
                 goto unlock;
         }
 
-        ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+        ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
                                     &masks);
 
 unlock:
@@ -335,7 +335,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
          */
         flush_tlb_all();
 #else
-        ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+        ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
                                     &masks);
 
         mmap_write_unlock(&init_mm);

include/linux/pagewalk.h

Lines changed: 3 additions & 4 deletions
@@ -129,10 +129,9 @@ struct mm_walk {
 int walk_page_range(struct mm_struct *mm, unsigned long start,
                 unsigned long end, const struct mm_walk_ops *ops,
                 void *private);
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
-                unsigned long end, const struct mm_walk_ops *ops,
-                pgd_t *pgd,
-                void *private);
+int walk_kernel_page_table_range(unsigned long start,
+                unsigned long end, const struct mm_walk_ops *ops,
+                pgd_t *pgd, void *private);
 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
                 unsigned long end, const struct mm_walk_ops *ops,
                 void *private);

mm/hugetlb_vmemmap.c

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
         VM_BUG_ON(!PAGE_ALIGNED(start | end));
 
         mmap_read_lock(&init_mm);
-        ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
+        ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
                                     NULL, walk);
         mmap_read_unlock(&init_mm);
         if (ret)

mm/internal.h

Lines changed: 3 additions & 0 deletions
@@ -1604,6 +1604,9 @@ static inline void accept_page(struct page *page)
 int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
                 unsigned long end, const struct mm_walk_ops *ops,
                 void *private);
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
+                unsigned long end, const struct mm_walk_ops *ops,
+                pgd_t *pgd, void *private);
 
 /* pt_reclaim.c */
 bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);

mm/pagewalk.c

Lines changed: 55 additions & 22 deletions
@@ -585,8 +585,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
 }
 
 /**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm: mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
  * @start: start address of the virtual address range
  * @end: end address of the virtual address range
  * @ops: operation to call during the walk
@@ -596,17 +595,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
  * Similar to walk_page_range() but can walk any page tables even if they are
  * not backed by VMAs. Because 'unusual' entries may be walked this function
  * will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel pages tables or page tables for firmware.
  *
  * Note: Be careful to walk the kernel pages tables, the caller may be need to
  * take other effective approaches (mmap lock may be insufficient) to prevent
  * the intermediate kernel page tables belonging to the specified address range
  * from being freed (e.g. memory hot-remove).
  */
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+        const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+        struct mm_struct *mm = &init_mm;
+        struct mm_walk walk = {
+                .ops            = ops,
+                .mm             = mm,
+                .pgd            = pgd,
+                .private        = private,
+                .no_vma         = true
+        };
+
+        if (start >= end)
+                return -EINVAL;
+        if (!check_ops_valid(ops))
+                return -EINVAL;
+
+        /*
+         * Kernel intermediate page tables are usually not freed, so the mmap
+         * read lock is sufficient. But there are some exceptions.
+         * E.g. memory hot-remove. In which case, the mmap lock is insufficient
+         * to prevent the intermediate kernel pages tables belonging to the
+         * specified address range from being freed. The caller should take
+         * other actions to prevent this race.
+         */
+        mmap_assert_locked(mm);
+
+        return walk_pgd_range(start, end, &walk);
+}
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
                 unsigned long end, const struct mm_walk_ops *ops,
-                pgd_t *pgd,
-                void *private)
+                pgd_t *pgd, void *private)
 {
         struct mm_walk walk = {
                 .ops            = ops,
@@ -616,34 +659,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
                 .no_vma         = true
         };
 
+        /* For convenience, we allow traversal of kernel mappings. */
+        if (mm == &init_mm)
+                return walk_kernel_page_table_range(start, end, ops,
+                                                pgd, private);
         if (start >= end || !walk.mm)
                 return -EINVAL;
         if (!check_ops_valid(ops))
                 return -EINVAL;
 
         /*
-         * 1) For walking the user virtual address space:
-         *
          * The mmap lock protects the page walker from changes to the page
          * tables during the walk. However a read lock is insufficient to
          * protect those areas which don't have a VMA as munmap() detaches
          * the VMAs before downgrading to a read lock and actually tearing
          * down PTEs/page tables. In which case, the mmap write lock should
-         * be hold.
-         *
-         * 2) For walking the kernel virtual address space:
-         *
-         * The kernel intermediate page tables usually do not be freed, so
-         * the mmap map read lock is sufficient. But there are some exceptions.
-         * E.g. memory hot-remove. In which case, the mmap lock is insufficient
-         * to prevent the intermediate kernel pages tables belonging to the
-         * specified address range from being freed. The caller should take
-         * other actions to prevent this race.
+         * be held.
          */
-        if (mm == &init_mm)
-                mmap_assert_locked(walk.mm);
-        else
-                mmap_assert_write_locked(walk.mm);
+        mmap_assert_write_locked(mm);
 
         return walk_pgd_range(start, end, &walk);
 }
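By contrast, the debugging path keeps the old novma semantics for user address spaces: ranges without a VMA are only protected against teardown by the mmap write lock, so walk_page_range_debug() now asserts it unconditionally (delegating to walk_kernel_page_table_range() when handed init_mm). A minimal caller sketch, with a hypothetical wrapper name; because the declaration now lives in mm/internal.h, such a caller has to live inside mm/, as ptdump_walk_pgd() does below:

#include <linux/mm.h>
#include <linux/mmap_lock.h>
#include <linux/pagewalk.h>
#include "internal.h"        /* walk_page_range_debug() is mm-internal */

/* Hypothetical wrapper: debug-walk a user range that need not be VMA-backed. */
static int example_debug_walk(struct mm_struct *mm, unsigned long start,
                              unsigned long end,
                              const struct mm_walk_ops *ops, void *private)
{
        int ret;

        /*
         * User mappings lacking a VMA are only protected against page table
         * teardown by the mmap write lock, hence write-lock, not read-lock.
         */
        mmap_write_lock(mm);
        ret = walk_page_range_debug(mm, start, end, ops,
                                    NULL /* pgd: use mm->pgd */, private);
        mmap_write_unlock(mm);
        return ret;
}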

mm/ptdump.c

Lines changed: 2 additions & 1 deletion
@@ -4,6 +4,7 @@
 #include <linux/debugfs.h>
 #include <linux/ptdump.h>
 #include <linux/kasan.h>
+#include "internal.h"
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 /*
@@ -177,7 +178,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
 
         mmap_write_lock(mm);
         while (range->start != range->end) {
-                walk_page_range_novma(mm, range->start, range->end,
+                walk_page_range_debug(mm, range->start, range->end,
                                       &ptdump_ops, pgd, st);
                 range++;
         }