Commit 9c9e4aa
mm: hugetlb: independent PMD page table shared count
jira VULN-46929
cve CVE-2024-57883
commit-author Liu Shixin <liushixin2@huawei.com>
commit 59d9094
upstream-diff The stable 5.15 backport 8410996eb6fea116fe1483ed977aacf580eee7b4
    was used for the actual (clean) cherry-pick. Additionally, the
    `atomic_t pt_share_count' field in `include/linux/mm_types.h' was wrapped
    in the RH_KABI_BROKEN_INSERT macro to avoid kABI checker complaints. This
    is justified because the inserted field (it is included, as
    CONFIG_ARCH_WANT_HUGE_PMD_SHARE gets enabled for at least
    `kernel-x86_64-rhel.config') is placed within a union that already
    contained a field of the same type, `atomic_t pt_frag_refcount', so the
    size of the union cannot change.

The folio refcount may be increased unexpectedly through try_get_folio() by a
caller such as split_huge_pages. In huge_pmd_unshare(), we use the refcount to
check whether a pmd page table is shared. The check is incorrect if the
refcount is increased by the above caller, and this can cause the page table
to be leaked:

 BUG: Bad page state in process sh pfn:109324
 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
 page_type: f2(table)
 raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
 page dumped because: nonzero mapcount
 ...
 CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
 Tainted: [B]=BAD_PAGE
 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
 Call trace:
  show_stack+0x20/0x38 (C)
  dump_stack_lvl+0x80/0xf8
  dump_stack+0x18/0x28
  bad_page+0x8c/0x130
  free_page_is_bad_report+0xa4/0xb0
  free_unref_page+0x3cc/0x620
  __folio_put+0xf4/0x158
  split_huge_pages_all+0x1e0/0x3e8
  split_huge_pages_write+0x25c/0x2d8
  full_proxy_write+0x64/0xd8
  vfs_write+0xcc/0x280
  ksys_write+0x70/0x110
  __arm64_sys_write+0x24/0x38
  invoke_syscall+0x50/0x120
  el0_svc_common.constprop.0+0xc8/0xf0
  do_el0_svc+0x24/0x38
  el0_svc+0x34/0x128
  el0t_64_sync_handler+0xc8/0xd0
  el0t_64_sync+0x190/0x198

The issue may be triggered by damon, offline_page, page_idle, etc., which will
increase the refcount of the page table. As a result:

1. The page table itself will be discarded after reporting the
   "nonzero mapcount".

2. The HugeTLB page mapped by the page table misses being freed, since we
   treat the page table as shared and a shared page table will not be
   unmapped.

Fix it by introducing an independent PMD page table shared count. As described
by the comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86
pgds and powerpc; pt_share_count is used for x86/arm64/riscv pmds, so we can
reuse the field as pt_share_count.

Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com
Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Ken Chen <kenneth.w.chen@intel.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 59d9094)
Signed-off-by: Marcin Wcisło <marcin.wcislo@conclusive.pl>
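As a side note on the mechanism, below is a minimal standalone sketch, not kernel code: the struct and function names are made up for illustration. It shows why checking a dedicated share count is robust where checking the generic page refcount is not, since a transient reference taken by an unrelated walker flips the old check but leaves the new one untouched.

/*
 * Userspace mock (not kernel code; all names illustrative) contrasting the
 * old refcount-based "is this PMD page table shared?" check with the new
 * dedicated share-count check.
 */
#include <stdatomic.h>
#include <stdio.h>

struct mock_pmd_page {
	atomic_int refcount;       /* generic page refcount (old check) */
	atomic_int pt_share_count; /* dedicated PMD share count (new check) */
};

/* Old logic: "shared" meant page_count() > 1. */
static int old_is_shared(struct mock_pmd_page *p)
{
	return atomic_load(&p->refcount) > 1;
}

/* New logic: "shared" means pt_share_count != 0. */
static int new_is_shared(struct mock_pmd_page *p)
{
	return atomic_load(&p->pt_share_count) != 0;
}

int main(void)
{
	struct mock_pmd_page pt;

	/* A PMD page table used by exactly one mapping: refcount 1, no sharers. */
	atomic_init(&pt.refcount, 1);
	atomic_init(&pt.pt_share_count, 0);

	/* An unrelated walker (think split_huge_pages) briefly takes a reference. */
	atomic_fetch_add(&pt.refcount, 1);

	printf("old check says shared: %d (wrong)\n", old_is_shared(&pt));
	printf("new check says shared: %d (correct)\n", new_is_shared(&pt));

	atomic_fetch_sub(&pt.refcount, 1);
	return 0;
}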
1 parent: d23c840

File tree: 3 files changed, +14 −10 lines

include/linux/mm.h

Lines changed: 3 additions & 0 deletions
@@ -2436,6 +2436,9 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
 	if (!pmd_ptlock_init(page))
 		return false;
 	__SetPageTable(page);
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+	atomic_set(&page->pt_share_count, 0);
+#endif
 	inc_lruvec_page_state(page, NR_PAGETABLE);
 	return true;
 }

include/linux/mm_types.h

Lines changed: 3 additions & 0 deletions
@@ -155,6 +155,9 @@ struct page {
 			union {
 				struct mm_struct *pt_mm; /* x86 pgds only */
 				atomic_t pt_frag_refcount; /* powerpc */
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+				RH_KABI_BROKEN_INSERT(atomic_t pt_share_count)
+#endif
 			};
 #if ALLOC_SPLIT_PTLOCKS
 			spinlock_t *ptl;
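A minimal standalone illustration of the kABI argument from the upstream-diff note above: a union is only as large as its largest member, so inserting another atomic_t next to members at least that big cannot change the structure's size. This is a simplified model, not the real struct page; the type names are made up.

/*
 * Simplified model (not the real struct page) showing why the inserted
 * union member cannot change the structure's size or layout.
 */
#include <stdatomic.h>

struct pagelike_before {
	union {
		void *pt_mm;                 /* stand-in for struct mm_struct * */
		atomic_int pt_frag_refcount;
	};
};

struct pagelike_after {
	union {
		void *pt_mm;
		atomic_int pt_frag_refcount;
		atomic_int pt_share_count;   /* the newly inserted field */
	};
};

_Static_assert(sizeof(struct pagelike_after) == sizeof(struct pagelike_before),
	       "adding pt_share_count inside the union does not grow the struct");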

mm/hugetlb.c

Lines changed: 8 additions & 10 deletions
@@ -6718,7 +6718,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		spte = huge_pte_offset(svma->vm_mm, saddr,
 				       vma_mmu_pagesize(svma));
 		if (spte) {
-			get_page(virt_to_page(spte));
+			atomic_inc(&virt_to_page(spte)->pt_share_count);
 			break;
 		}
 	}
@@ -6733,7 +6733,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 			(pmd_t *)((unsigned long)spte & PAGE_MASK));
 		mm_inc_nr_pmds(mm);
 	} else {
-		put_page(virt_to_page(spte));
+		atomic_dec(&virt_to_page(spte)->pt_share_count);
 	}
 	spin_unlock(ptl);
out:
@@ -6744,29 +6744,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 /*
  * unmap huge page backed by shared pte.
  *
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *	    0 the underlying pte page is not shared, or it is the last user
  */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 					unsigned long *addr, pte_t *ptep)
 {
+	unsigned long sz = huge_page_size(hstate_vma(vma));
 	pgd_t *pgd = pgd_offset(mm, *addr);
 	p4d_t *p4d = p4d_offset(pgd, *addr);
 	pud_t *pud = pud_offset(p4d, *addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
-	BUG_ON(page_count(virt_to_page(ptep)) == 0);
-	if (page_count(virt_to_page(ptep)) == 1)
+	if (sz != PMD_SIZE)
+		return 0;
+	if (!atomic_read(&virt_to_page(ptep)->pt_share_count))
 		return 0;
 
 	pud_clear(pud);
-	put_page(virt_to_page(ptep));
+	atomic_dec(&virt_to_page(ptep)->pt_share_count);
 	mm_dec_nr_pmds(mm);
 	/*
 	 * This update of passed address optimizes loops sequentially
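For orientation, here is a rough userspace mock of the counting discipline the hunks above establish; the names are illustrative and this is not the kernel implementation. huge_pmd_share() increments pt_share_count when it attaches to another VMA's populated PMD page (and decrements it again if it loses the race to install the entry), while huge_pmd_unshare() treats a zero count as "not shared" and otherwise clears the PUD and drops one share.

/*
 * Userspace mock (illustrative names only) of the share/unshare counting
 * discipline: sharers bump the count when they attach to an existing PMD
 * page table and drop it when they detach; zero means "not shared / last user".
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mock_pmd_table {
	atomic_int pt_share_count;
};

/* huge_pmd_share()-like path: attach to another VMA's populated PMD page. */
static void mock_attach(struct mock_pmd_table *pt)
{
	atomic_fetch_add(&pt->pt_share_count, 1);
}

/* Lost the race to install the shared entry: undo the attach. */
static void mock_detach_on_race(struct mock_pmd_table *pt)
{
	atomic_fetch_sub(&pt->pt_share_count, 1);
}

/* huge_pmd_unshare()-like check: zero count means "not shared". */
static bool mock_unshare(struct mock_pmd_table *pt)
{
	if (!atomic_load(&pt->pt_share_count))
		return false;                  /* last user keeps the table */
	/* the real code clears the PUD entry here */
	atomic_fetch_sub(&pt->pt_share_count, 1);
	return true;
}

int main(void)
{
	struct mock_pmd_table pt;

	atomic_init(&pt.pt_share_count, 0);
	mock_attach(&pt);                      /* a second mapping attaches */
	mock_detach_on_race(&pt);              /* e.g. lost the install race, undo */
	mock_attach(&pt);                      /* attach again successfully */
	printf("unshare #1 -> %d\n", mock_unshare(&pt)); /* 1: dropped a sharer */
	printf("unshare #2 -> %d\n", mock_unshare(&pt)); /* 0: no longer shared */
	return 0;
}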