
Commit d240629

Peng Zhang authored and akpm00 committed
fork: use __mt_dup() to duplicate maple tree in dup_mmap()
In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then
directly replacing the entries of VMAs in the new maple tree can result in
better performance. __mt_dup() uses DFS pre-order to duplicate the maple
tree, so it is efficient.

The average time complexity of __mt_dup() is O(n), where n is the number
of VMAs. The proof of the time complexity is provided in the commit log
that introduces __mt_dup(). After duplicating the maple tree, each element
is traversed and replaced (ignoring the cases of deletion, which are
rare). Since it is only a replacement operation for each element, this
process is also O(n).

Analyzing the exact time complexity of the previous algorithm is
challenging because each insertion can involve appending to a node,
pushing data to adjacent nodes, or even splitting nodes. The frequency of
each action is difficult to calculate. The worst-case scenario for a
single insertion is when the tree undergoes splitting at every level. If
we consider each insertion as the worst-case scenario, we can determine
that the upper bound of the time complexity is O(n*log(n)), although this
is a loose upper bound. However, based on the test data, it appears that
the actual time complexity is likely to be O(n).

As the entire maple tree is duplicated using __mt_dup(), if dup_mmap()
fails, there will be a portion of VMAs that have not been duplicated in
the maple tree. To handle this, we mark the failure point with
XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop
releasing VMAs that have not been duplicated after this point.

There is a "spawn" test in byte-unixbench[1] that can be used to measure
the performance of fork(). I modified it slightly to make it work with
different numbers of VMAs.

Below are the test results. The first row shows the number of VMAs. The
second and third rows show the number of fork() calls per ten seconds on
next-20231006 and on this patchset, respectively. The fourth row shows the
relative improvement. The test results were obtained with CPU binding to
avoid scheduler load balancing that could cause unstable results. There
are still some fluctuations in the test results, but at least they are
better than the original performance.

21     121    221    421    821    1621   3221   6421   12821  25621  51221
112100 76261  54227  34035  20195  11112  6017   3161   1606   802    393
114558 83067  65008  45824  28751  16072  8922   4747   2436   1233   599
2.19%  8.92%  19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42%

[1] https://github.com/kdlucas/byte-unixbench/tree/master

Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
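To make the control flow concrete, here is a condensed sketch of the
duplicate-then-replace pattern this commit adopts. It is illustrative
only: locking, accounting, and most error paths are elided, and the real
code is in the kernel/fork.c diff below.

	/* One O(n) DFS pass builds a structurally identical maple tree. */
	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
	if (unlikely(retval))
		goto out;

	/* Every slot already exists in the new tree, so each store below
	 * is a pure replacement and cannot need to allocate tree nodes. */
	for_each_vma(vmi, mpnt) {
		tmp = vm_area_dup(mpnt);	/* duplicate the VMA itself */
		if (!tmp) {
			/* Mark the first un-duplicated VMA so exit_mmap()
			 * knows to stop releasing entries at this point. */
			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
			mas_store(&vmi.mas, XA_ZERO_ENTRY);
			goto loop_out;
		}
		vma_iter_bulk_store(&vmi, tmp);	/* replace old with new */
	}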
1 parent 8e50d32 commit d240629

File tree

5 files changed: +52 −26 lines changed


include/linux/mm.h

Lines changed: 11 additions & 0 deletions

@@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
 	return mas_expected_entries(&vmi->mas, count);
 }
 
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /* Free any unused preallocations */
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
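This helper moves here from mm/internal.h so dup_mmap() can use it. Note
that maple tree ranges are inclusive, hence the end - 1 when clearing a
VMA's half-open [vm_start, vm_end) range. A minimal usage sketch,
mirroring the VM_DONTCOPY path added in the kernel/fork.c diff below:

	/* Drop the inherited entry for a VM_DONTCOPY region. Storing NULL
	 * can split nodes and thus allocate, so this can fail (-ENOMEM). */
	retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end,
				    GFP_KERNEL);
	if (retval)
		goto loop_out;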

kernel/fork.c

Lines changed: 29 additions & 11 deletions

@@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	int retval;
 	unsigned long charge = 0;
 	LIST_HEAD(uf);
-	VMA_ITERATOR(old_vmi, oldmm, 0);
 	VMA_ITERATOR(vmi, mm, 0);
 
 	uprobe_start_dup_mmap();
@@ -678,16 +677,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		goto out;
 	khugepaged_fork(mm, oldmm);
 
-	retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
-	if (retval)
+	/* Use __mt_dup() to efficiently build an identical maple tree. */
+	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+	if (unlikely(retval))
 		goto out;
 
 	mt_clear_in_rcu(vmi.mas.tree);
-	for_each_vma(old_vmi, mpnt) {
+	for_each_vma(vmi, mpnt) {
 		struct file *file;
 
 		vma_start_write(mpnt);
 		if (mpnt->vm_flags & VM_DONTCOPY) {
+			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+						    mpnt->vm_end, GFP_KERNEL);
+			if (retval)
+				goto loop_out;
+
 			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
 			continue;
 		}
@@ -749,9 +754,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (is_vm_hugetlb_page(tmp))
 			hugetlb_dup_vma_private(tmp);
 
-		/* Link the vma into the MT */
-		if (vma_iter_bulk_store(&vmi, tmp))
-			goto fail_nomem_vmi_store;
+		/*
+		 * Link the vma into the MT. After using __mt_dup(), memory
+		 * allocation is not necessary here, so it cannot fail.
+		 */
+		vma_iter_bulk_store(&vmi, tmp);
 
 		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -760,15 +767,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
 
-		if (retval)
+		if (retval) {
+			mpnt = vma_next(&vmi);
 			goto loop_out;
+		}
 	}
 	/* a new mm has just been created */
 	retval = arch_dup_mmap(oldmm, mm);
 loop_out:
 	vma_iter_free(&vmi);
-	if (!retval)
+	if (!retval) {
 		mt_set_in_rcu(vmi.mas.tree);
+	} else if (mpnt) {
+		/*
+		 * The entire maple tree has already been duplicated. If the
+		 * mmap duplication fails, mark the failure point with
+		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+		 * stop releasing VMAs that have not been duplicated after this
+		 * point.
+		 */
+		mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+		mas_store(&vmi.mas, XA_ZERO_ENTRY);
+	}
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
@@ -778,8 +798,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	uprobe_end_dup_mmap();
 	return retval;
 
-fail_nomem_vmi_store:
-	unlink_anon_vmas(tmp);
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
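For clarity, after a mid-loop failure the child's maple tree looks roughly
like this (an illustrative comment, not code from the diff):

	/*
	 *   [VMA0'][VMA1'] ... [XA_ZERO_ENTRY][parent VMA k+1] ...
	 *
	 * Entries before the marker are duplicates owned by the new mm and
	 * are torn down normally; the marker and everything after it still
	 * reference the parent's VMAs, which exit_mmap(), free_pgtables()
	 * and unmap_vmas() must not touch.
	 */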

mm/internal.h

Lines changed: 0 additions & 11 deletions

@@ -1154,17 +1154,6 @@ static inline void vma_iter_clear(struct vma_iterator *vmi)
 	mas_store_prealloc(&vmi->mas, NULL);
 }
 
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
-			unsigned long start, unsigned long end, gfp_t gfp)
-{
-	__mas_set_range(&vmi->mas, start, end - 1);
-	mas_store_gfp(&vmi->mas, NULL, gfp);
-	if (unlikely(mas_is_err(&vmi->mas)))
-		return -ENOMEM;
-
-	return 0;
-}
-
 static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
 {
 	return mas_walk(&vmi->mas);

mm/memory.c

Lines changed: 6 additions & 1 deletion

@@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 	 * be 0. This will underflow and is okay.
 	 */
 	next = mas_find(mas, ceiling - 1);
+	if (unlikely(xa_is_zero(next)))
+		next = NULL;
 
 	/*
 	 * Hide vma from rmap and truncate_pagecache before freeing
@@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		    && !is_vm_hugetlb_page(next)) {
 			vma = next;
 			next = mas_find(mas, ceiling - 1);
+			if (unlikely(xa_is_zero(next)))
+				next = NULL;
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
@@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		unmap_single_vma(tlb, vma, start, end, &details,
 				 mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
-	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
+		vma = mas_find(mas, tree_end - 1);
+	} while (vma && likely(!xa_is_zero(vma)));
 	mmu_notifier_invalidate_range_end(&range);
 }
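Both walkers now treat XA_ZERO_ENTRY as a hard end-of-tree. The shared
convention these hunks adopt, as a sketch:

	vma = mas_find(&mas, ULONG_MAX);
	while (vma && likely(!xa_is_zero(vma))) {
		/* ... operate on a genuinely duplicated VMA ... */
		vma = mas_find(&mas, ULONG_MAX);
	}
	/* Stop: vma is either NULL (normal end) or the failure marker. */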

mm/mmap.c

Lines changed: 6 additions & 3 deletions

@@ -3294,10 +3294,11 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = mas_find(&mas, ULONG_MAX);
-	if (!vma) {
+	if (!vma || unlikely(xa_is_zero(vma))) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
-		return;
+		mmap_write_lock(mm);
+		goto destroy;
 	}
 
 	lru_add_drain();
@@ -3332,11 +3333,13 @@ void exit_mmap(struct mm_struct *mm)
 		remove_vma(vma, true);
 		count++;
 		cond_resched();
-	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+		vma = mas_find(&mas, ULONG_MAX);
+	} while (vma && likely(!xa_is_zero(vma)));
 
 	BUG_ON(count != mm->map_count);
 
 	trace_exit_mmap(mm);
+destroy:
 	__mt_destroy(&mm->mm_mt);
 	mmap_write_unlock(mm);
 	vm_unacct_memory(nr_accounted);
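The old early return was safe because an empty tree held no nodes. Now,
even when the first entry found is already the XA_ZERO_ENTRY marker, the
tree duplicated by __mt_dup() still holds allocated nodes, so it must be
destroyed. A sketch of the new early path, assuming the locking shown in
the hunk above:

	/* dup_mmap() failed before duplicating any VMA: there is nothing
	 * to unmap, but the duplicated tree nodes and the failure marker
	 * must still be freed. */
	mmap_read_unlock(mm);
	mmap_write_lock(mm);	/* __mt_destroy() needs the write lock */
	__mt_destroy(&mm->mm_mt);
	mmap_write_unlock(mm);
	vm_unacct_memory(nr_accounted);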
