Commit 5e31275

surenbaghdasaryan authored and akpm00 committed
mm: add per-VMA lock and helper functions to control it
Introduce per-VMA locking. The lock implementation relies on per-vma and per-mm sequence counters to note exclusive locking:

- read lock (implemented by vma_start_read) requires the vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ. If they match then there must be a vma exclusive lock held somewhere.
- read unlock (implemented by vma_end_read) is a trivial vma->lock unlock.
- write lock (vma_start_write) requires the mmap_lock to be held exclusively and assigns the current mm counter to the vma counter. This allows multiple vmas to be locked under a single mmap_lock write lock (e.g. during vma merging). The vma counter is modified under the exclusive vma lock.
- write unlock (vma_end_write_all) is a batch release of all vma locks held. It doesn't pair with a specific vma_start_write! It is done before the exclusive mmap_lock is released, by incrementing the mm sequence counter (mm_lock_seq).
- write downgrade - if the mmap_lock is downgraded to a read lock, all vma write locks are released as well (effectively the same as write unlock).

Link: https://lkml.kernel.org/r/20230227173632.3292573-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
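For orientation, here is a minimal reader-side sketch of the protocol described above: try the per-VMA read lock and fall back to mmap_lock when vma_start_read() refuses. The lookup helper find_vma_under_rcu() and the fault work handle_fault_locked() are hypothetical placeholders; wiring the page-fault path to these helpers happens in later patches of the series.

/*
 * Hypothetical reader-side usage (not part of this commit): attempt a
 * speculative, per-VMA-locked fault and ask the caller to retry under
 * mmap_lock if the per-VMA read lock cannot be taken.
 */
static vm_fault_t fault_with_per_vma_lock(struct mm_struct *mm,
					  unsigned long addr)
{
	struct vm_area_struct *vma;
	vm_fault_t ret;

	rcu_read_lock();
	vma = find_vma_under_rcu(mm, addr);	/* hypothetical lockless lookup */
	if (!vma || !vma_start_read(vma)) {
		rcu_read_unlock();
		return VM_FAULT_RETRY;		/* fall back to mmap_lock */
	}
	rcu_read_unlock();	/* the per-VMA read lock now keeps the vma stable */

	ret = handle_fault_locked(vma, addr);	/* hypothetical fault work */
	vma_end_read(vma);
	return ret;
}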
1 parent 438b6e1 commit 5e31275

File tree

5 files changed, +110 -0 lines changed


include/linux/mm.h

Lines changed: 82 additions & 0 deletions
@@ -624,6 +624,87 @@ struct vm_operations_struct {
 					unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->lock);
+	rcu_read_unlock();
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -632,6 +713,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */

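The write-side helpers above are meant to be called with the mmap_lock already held for writing; a vma write lock taken this way is not dropped individually but in batch when the mmap_lock is released (see vma_end_write_all() in include/linux/mmap_lock.h below). A small illustrative pattern, with the vma array, its length nr, and the modification step left abstract:

	/* Sketch: write-lock several vmas under one mmap_lock write lock. */
	mmap_write_lock(mm);
	for (i = 0; i < nr; i++)
		vma_start_write(vmas[i]);	/* marks each vma write-locked */

	/* ... modify the vmas; lockless readers are excluded ... */

	mmap_write_unlock(mm);	/* vma_end_write_all() releases every vma lock */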
include/linux/mm_types.h

Lines changed: 8 additions & 0 deletions
@@ -503,6 +503,11 @@ struct vm_area_struct {
 	vm_flags_t __private __vm_flags;
 	};
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -639,6 +644,9 @@ struct mm_struct {
 					 * init_mm.mmlist, and are protected
 					 * by mmlist_lock
 					 */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */

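The two fields added here carry the whole locking state: a vma counts as write-locked exactly when its vm_lock_seq matches the owning mm's mm_lock_seq. A hypothetical predicate (not part of this patch) expressing that invariant:

/* Hypothetical helper: true if the vma is currently write-locked. */
static inline bool vma_is_write_locked(struct vm_area_struct *vma)
{
	return vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq);
}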
include/linux/mmap_lock.h

Lines changed: 13 additions & 0 deletions
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 

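Because vma_end_write_all() runs before both the unlock and the downgrade, downgrading the mmap_lock to a read lock also releases every per-VMA write lock, as the commit message notes. A short sketch of the observable effect, assuming a vma already attached to mm:

	mmap_write_lock(mm);
	vma_start_write(vma);		/* vma->vm_lock_seq == mm->mm_lock_seq */
	/* vma_start_read(vma) from another thread fails here */
	mmap_write_downgrade(mm);	/* bumps mm_lock_seq before downgrading */
	/* mmap_lock is still read-held, yet vma_start_read(vma) can succeed */
	mmap_read_unlock(mm);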
kernel/fork.c

Lines changed: 4 additions & 0 deletions
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		data_race(memcpy(new, orig, sizeof(*new)));
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1208,6 +1209,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;

mm/init-mm.c

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
