Commit 40549ba

mjkravetz authored and akpm00 committed
hugetlb: use new vma_lock for pmd sharing synchronization
The new hugetlb vma lock is used to address this race:

Faulting thread                                 Unsharing thread
...                                                  ...
ptep = huge_pte_offset()
  or
ptep = huge_pte_alloc()
...
                                                i_mmap_lock_write
                                                lock page table
ptep invalid   <------------------------        huge_pmd_unshare()
Could be in a previously                        unlock_page_table
sharing process or worse                        i_mmap_unlock_write
...

The vma_lock is used as follows:
- During fault processing.  The lock is acquired in read mode before
  doing a page table lock and allocation (huge_pte_alloc).  The lock is
  held until code is finished with the page table entry (ptep).
- The lock must be held in write mode whenever huge_pmd_unshare is
  called.

Lock ordering issues come into play when unmapping a page from all
vmas mapping the page.  The i_mmap_rwsem must be held to search for the
vmas, and the vma lock must be held before calling unmap which will
call huge_pmd_unshare.  This is done today in:
- try_to_migrate_one and try_to_unmap_ for page migration and memory
  error handling.  In these routines we 'try' to obtain the vma lock and
  fail to unmap if unsuccessful.  Calling routines already deal with the
  failure of unmapping.
- hugetlb_vmdelete_list for truncation and hole punch.  This routine
  also tries to acquire the vma lock.  If it fails, it skips the
  unmapping.  However, we can not have file truncation or hole punch
  fail because of contention.  After hugetlb_vmdelete_list, truncation
  and hole punch call remove_inode_hugepages.  remove_inode_hugepages
  checks for mapped pages and calls hugetlb_unmap_file_page to unmap them.
  hugetlb_unmap_file_page is designed to drop locks and reacquire in the
  correct order to guarantee unmap success.

Link: https://lkml.kernel.org/r/20220914221810.95771-9-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 378397c commit 40549ba
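
For orientation, the locking rule described in the commit message is a plain reader/writer discipline: fault handlers hold the vma lock in read mode while they use a ptep, and anything that may call huge_pmd_unshare holds it in write mode. The sketch below is a minimal userspace model of that discipline, assuming POSIX rwlocks; the toy_* names are hypothetical stand-ins, not the kernel API (the real lock is a per-vma rw_semaphore reached through vm_private_data).

/* Userspace model only: a pthread rwlock in place of the hugetlb vma lock. */
#include <pthread.h>

struct toy_vma {
	pthread_rwlock_t vma_lock;	/* models the per-vma rw_sema */
	long *shared_pmd;		/* models a PMD page that may be shared */
};

/* Fault path: hold the lock in read mode for as long as the ptep is used. */
static void toy_fault(struct toy_vma *vma)
{
	pthread_rwlock_rdlock(&vma->vma_lock);
	long *ptep = vma->shared_pmd;	/* huge_pte_offset()/huge_pte_alloc() stand-in */
	if (ptep)
		*ptep += 1;		/* safe: unshare is excluded while read lock is held */
	pthread_rwlock_unlock(&vma->vma_lock);
}

/* Unshare path: must hold the lock in write mode around huge_pmd_unshare(). */
static void toy_unshare(struct toy_vma *vma)
{
	pthread_rwlock_wrlock(&vma->vma_lock);
	vma->shared_pmd = NULL;		/* the ptep other threads saw becomes invalid */
	pthread_rwlock_unlock(&vma->vma_lock);
}

int main(void)
{
	static long pmd;
	struct toy_vma vma = { .shared_pmd = &pmd };

	pthread_rwlock_init(&vma.vma_lock, NULL);
	toy_fault(&vma);	/* reader sees a stable ptep */
	toy_unshare(&vma);	/* writer waits for readers, then invalidates */
	toy_fault(&vma);	/* reader now sees NULL and skips the access */
	pthread_rwlock_destroy(&vma.vma_lock);
	return 0;
}

The 'try' behavior mentioned for try_to_migrate_one and hugetlb_vmdelete_list corresponds, in this model, to pthread_rwlock_trywrlock() callers that simply skip the vma when the lock is contended.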

5 files changed (+233 -46 lines changed)

fs/hugetlbfs/inode.c
Lines changed: 65 additions & 1 deletion

@@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
 				     struct folio *folio, pgoff_t index)
 {
 	struct rb_root_cached *root = &mapping->i_mmap;
+	struct hugetlb_vma_lock *vma_lock;
 	struct page *page = &folio->page;
 	struct vm_area_struct *vma;
 	unsigned long v_start;
@@ -444,19 +445,72 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
 	end = (index + 1) * pages_per_huge_page(h);
 
 	i_mmap_lock_write(mapping);
-
+retry:
+	vma_lock = NULL;
 	vma_interval_tree_foreach(vma, root, start, end - 1) {
 		v_start = vma_offset_start(vma, start);
 		v_end = vma_offset_end(vma, end);
 
 		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
 			continue;
 
+		if (!hugetlb_vma_trylock_write(vma)) {
+			vma_lock = vma->vm_private_data;
+			/*
+			 * If we can not get vma lock, we need to drop
+			 * immap_sema and take locks in order.  First,
+			 * take a ref on the vma_lock structure so that
+			 * we can be guaranteed it will not go away when
+			 * dropping immap_sema.
+			 */
+			kref_get(&vma_lock->refs);
+			break;
+		}
+
 		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
 				NULL, ZAP_FLAG_DROP_MARKER);
+		hugetlb_vma_unlock_write(vma);
 	}
 
 	i_mmap_unlock_write(mapping);
+
+	if (vma_lock) {
+		/*
+		 * Wait on vma_lock.  We know it is still valid as we have
+		 * a reference.  We must 'open code' vma locking as we do
+		 * not know if vma_lock is still attached to vma.
+		 */
+		down_write(&vma_lock->rw_sema);
+		i_mmap_lock_write(mapping);
+
+		vma = vma_lock->vma;
+		if (!vma) {
+			/*
+			 * If lock is no longer attached to vma, then just
+			 * unlock, drop our reference and retry looking for
+			 * other vmas.
+			 */
+			up_write(&vma_lock->rw_sema);
+			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+			goto retry;
+		}
+
+		/*
+		 * vma_lock is still attached to vma.  Check to see if vma
+		 * still maps page and if so, unmap.
+		 */
+		v_start = vma_offset_start(vma, start);
+		v_end = vma_offset_end(vma, end);
+		if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+			unmap_hugepage_range(vma, vma->vm_start + v_start,
+						v_end, NULL,
+						ZAP_FLAG_DROP_MARKER);
+
+		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+		hugetlb_vma_unlock_write(vma);
+
+		goto retry;
+	}
 }
 
 static void
@@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
 		unsigned long v_start;
 		unsigned long v_end;
 
+		if (!hugetlb_vma_trylock_write(vma))
+			continue;
+
 		v_start = vma_offset_start(vma, start);
 		v_end = vma_offset_end(vma, end);
 
 		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
 				     NULL, zap_flags);
+
+		/*
+		 * Note that vma lock only exists for shared/non-private
+		 * vmas.  Therefore, lock is not held when calling
+		 * unmap_hugepage_range for private vmas.
+		 */
+		hugetlb_vma_unlock_write(vma);
 	}
 }
 
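The retry loop added to hugetlb_unmap_file_folio() above is an instance of a general pattern: when a trylock fails while the outer lock is held, pin the inner lock object with a reference, drop the outer lock, take both locks in the documented order, and re-check that the object is still attached before using it. Below is a rough userspace sketch of that pattern, assuming pthreads and C11 atomics; the toy_* types and names are illustrative and do not correspond to the kernel's kref/rw_semaphore/i_mmap interfaces.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct toy_vma;				/* opaque here; only checked for attachment */

struct toy_vma_lock {
	pthread_rwlock_t rw_sema;	/* models hugetlb_vma_lock->rw_sema */
	atomic_int refs;		/* models the embedded kref */
	struct toy_vma *vma;		/* NULL once detached from its vma */
};

static void toy_lock_put(struct toy_vma_lock *vl)
{
	if (atomic_fetch_sub(&vl->refs, 1) == 1)
		free(vl);		/* models hugetlb_vma_lock_release() */
}

/* Called and returns with the outer 'mapping' lock held, as in the kernel code. */
static void unmap_one_vma(pthread_mutex_t *mapping, struct toy_vma_lock *vl)
{
	if (pthread_rwlock_trywrlock(&vl->rw_sema) == 0) {
		/* fast path: locks already held in the right order; unmap here */
		pthread_rwlock_unlock(&vl->rw_sema);
		return;
	}

	/* Pin the lock object so it cannot vanish once 'mapping' is dropped. */
	atomic_fetch_add(&vl->refs, 1);
	pthread_mutex_unlock(mapping);

	/* Reacquire in the documented order: vma lock first, then mapping. */
	pthread_rwlock_wrlock(&vl->rw_sema);
	pthread_mutex_lock(mapping);

	if (vl->vma) {
		/* still attached: safe to unmap here */
	} else {
		/* detached while we slept: the caller retries the interval-tree walk */
	}

	pthread_rwlock_unlock(&vl->rw_sema);
	toy_lock_put(vl);		/* drop the pin taken above */
}

The key property, mirrored in the diff, is that the outer i_mmap lock is never held while sleeping on a contended vma lock, yet the extra reference keeps the vma lock object alive across the window where neither lock is held.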

mm/hugetlb.c
Lines changed: 93 additions & 9 deletions

@@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_invalidate_range_start(&range);
 		mmap_assert_write_locked(src);
 		raw_write_seqcount_begin(&src->write_protect_seq);
+	} else {
+		/*
+		 * For shared mappings the vma lock must be held before
+		 * calling huge_pte_offset in the src vma. Otherwise, the
+		 * returned ptep could go away if part of a shared pmd and
+		 * another thread calls huge_pmd_unshare.
+		 */
+		hugetlb_vma_lock_read(src_vma);
 	}
 
 	last_addr_mask = hugetlb_mask_last_page(h);
@@ -4942,6 +4950,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	if (cow) {
 		raw_write_seqcount_end(&src->write_protect_seq);
 		mmu_notifier_invalidate_range_end(&range);
+	} else {
+		hugetlb_vma_unlock_read(src_vma);
 	}
 
 	return ret;
@@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(&range);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	/* Prevent race with file truncation */
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
 		src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	flush_tlb_range(vma, old_end - len, old_end);
 	mmu_notifier_invalidate_range_end(&range);
 	i_mmap_unlock_write(mapping);
+	hugetlb_vma_unlock_write(vma);
 
 	return len + old_addr - old_end;
 }
@@ -5350,8 +5362,29 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * may get SIGKILLed if it later faults.
 		 */
 		if (outside_reserve) {
+			struct address_space *mapping = vma->vm_file->f_mapping;
+			pgoff_t idx;
+			u32 hash;
+
 			put_page(old_page);
+			/*
+			 * Drop hugetlb_fault_mutex and vma_lock before
+			 * unmapping.  unmapping needs to hold vma_lock
+			 * in write mode.  Dropping vma_lock in read mode
+			 * here is OK as COW mappings do not interact with
+			 * PMD sharing.
+			 *
+			 * Reacquire both after unmap operation.
+			 */
+			idx = vma_hugecache_offset(h, vma, haddr);
+			hash = hugetlb_fault_mutex_hash(mapping, idx);
+			hugetlb_vma_unlock_read(vma);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
 			unmap_ref_private(mm, vma, old_page, haddr);
+
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			hugetlb_vma_lock_read(vma);
 			spin_lock(ptl);
 			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 			if (likely(ptep &&
@@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
 	};
 
 	/*
-	 * hugetlb_fault_mutex and i_mmap_rwsem must be
+	 * vma_lock and hugetlb_fault_mutex must be
 	 * dropped before handling userfault.  Reacquire
 	 * after handling fault to make calling code simpler.
 	 */
+	hugetlb_vma_unlock_read(vma);
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	ret = handle_userfault(&vmf, reason);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+	hugetlb_vma_lock_read(vma);
 
 	return ret;
 }
@@ -5741,30 +5776,47 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
+		/*
+		 * Since we hold no locks, ptep could be stale.  That is
+		 * OK as we are only making decisions based on content and
+		 * not actually modifying content here.
+		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
 				VM_FAULT_SET_HINDEX(hstate_index(h));
-	} else {
-		ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-		if (!ptep)
-			return VM_FAULT_OOM;
 	}
 
-	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, vma, haddr);
-
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
+	/*
+	 * Acquire vma lock before calling huge_pte_alloc and hold
+	 * until finished with ptep.  This prevents huge_pmd_unshare from
+	 * being called elsewhere and making the ptep no longer valid.
+	 *
+	 * ptep could have already be assigned via huge_pte_offset.  That
+	 * is OK, as huge_pte_alloc will return the same value unless
+	 * something has changed.
+	 */
+	hugetlb_vma_lock_read(vma);
+	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+	if (!ptep) {
+		hugetlb_vma_unlock_read(vma);
+		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+		return VM_FAULT_OOM;
+	}
+
 	entry = huge_ptep_get(ptep);
 	/* PTE markers should be handled the same way as none pte */
 	if (huge_pte_none_mostly(entry)) {
@@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
 	}
+	hugetlb_vma_unlock_read(vma);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	return handle_userfault(&vmf, VM_UFFD_WP);
 }
@@ -5868,6 +5921,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 out_mutex:
+	hugetlb_vma_unlock_read(vma);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
@@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
-	last_addr_mask = hugetlb_mask_last_page(h);
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	for (; address < end; address += psize) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address, psize);
@@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	hugetlb_vma_unlock_write(vma);
 	mmu_notifier_invalidate_range_end(&range);
 
 	return pages << h->order;
@@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	hugetlb_vma_assert_locked(vma);
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
 	if (page_count(virt_to_page(ptep)) == 1)
 		return 0;
@@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
 void hugetlb_vma_lock_release(struct kref *kref)
 {
 }
@@ -7325,6 +7407,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = huge_pte_offset(mm, address, sz);
@@ -7336,6 +7419,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	hugetlb_vma_unlock_write(vma);
 	/*
 	 * No need to call mmu_notifier_invalidate_range(), see
 	 * Documentation/mm/mmu_notifier.rst.
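
The hugetlb_fault() changes above fix the acquisition order: take the per-index fault mutex, then the vma lock in read mode, then call huge_pte_alloc(), and release in the reverse order on every exit path (including the new VM_FAULT_OOM unwind). The compressed userspace model below illustrates only that ordering; toy_ctx, toy_fault() and the pthread primitives are hypothetical stand-ins, not the kernel code.

#include <pthread.h>

struct toy_ctx {
	pthread_mutex_t fault_mutex;	/* models hugetlb_fault_mutex_table[hash] */
	pthread_rwlock_t vma_lock;	/* models the hugetlb vma lock */
	long *pte;			/* models the page table entry */
};

/* Returns 0 on success, -1 to model VM_FAULT_OOM. */
static int toy_fault(struct toy_ctx *c)
{
	int ret = 0;

	pthread_mutex_lock(&c->fault_mutex);	/* serialize instantiation of this index */
	pthread_rwlock_rdlock(&c->vma_lock);	/* keep the ptep valid vs. huge_pmd_unshare */

	long *ptep = c->pte;			/* huge_pte_alloc() stand-in */
	if (!ptep) {
		ret = -1;			/* VM_FAULT_OOM */
		goto out;
	}
	*ptep += 1;				/* ... fault handling that relies on ptep ... */

out:
	/* Drop the locks in reverse order on every exit path. */
	pthread_rwlock_unlock(&c->vma_lock);
	pthread_mutex_unlock(&c->fault_mutex);
	return ret;
}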

mm/memory.c
Lines changed: 2 additions & 0 deletions

@@ -1684,10 +1684,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 		if (vma->vm_file) {
 			zap_flags_t zap_flags = details ?
 			    details->zap_flags : 0;
+			hugetlb_vma_lock_write(vma);
 			i_mmap_lock_write(vma->vm_file->f_mapping);
 			__unmap_hugepage_range_final(tlb, vma, start, end,
 						     NULL, zap_flags);
 			i_mmap_unlock_write(vma->vm_file->f_mapping);
+			hugetlb_vma_unlock_write(vma);
 		}
 	} else
 		unmap_page_range(tlb, vma, start, end, details);
