
Commit af7f588

compudj authored and Peter Zijlstra committed
sched: Introduce per-memory-map concurrency ID
This feature allows the scheduler to expose a per-memory-map concurrency ID to user-space. This concurrency ID is within the possible cpus range, and is temporarily (and uniquely) assigned while threads are actively running within a memory map. If a memory map has fewer threads than cores, or is limited to run on few cores concurrently through sched affinity or cgroup cpusets, the concurrency IDs will be values close to 0, thus allowing efficient use of user-space memory for per-cpu data structures. This feature is meant to be exposed by a new rseq thread area field.

The primary purpose of this feature is to do the heavy lifting needed by memory allocators to allow them to use per-cpu data structures efficiently in the following situations:

- Single-threaded applications,
- Multi-threaded applications on large systems (many cores) with a limited cpu affinity mask,
- Multi-threaded applications on large systems (many cores) with a restricted cgroup cpuset per container.

One of the key concerns from scheduler maintainers is the overhead associated with additional spinlocks or atomic operations in the scheduler fast-path. This is why the following optimization is implemented: on context switch between threads belonging to the same memory map, transfer the mm_cid from prev to next without any atomic ops. This takes care of use-cases involving frequent context switches between threads belonging to the same memory map.

Additional optimizations can be done if the spinlocks added when context switching between threads belonging to different memory maps end up being a performance bottleneck. Those are left out of this patch: a performance impact would have to be clearly demonstrated to justify the added complexity.

The credit goes to Paul Turner (Google) for the original virtual cpu id idea. This feature is implemented based on discussions with Paul Turner and Peter Oskolkov (Google), but I took the liberty to implement scheduler fast-path optimizations and my own NUMA-awareness scheme. Rumor has it that Google has been running an rseq vcpu_id extension internally in production for a year. The tcmalloc source code indeed has comments hinting at a vcpu_id prototype extension to the rseq system call [1].

The following benchmarks do not show any significant overhead added to the scheduler context switch by this feature:

* perf bench sched messaging (process)
  Baseline:    86.5±0.3 ms
  With mm_cid: 86.7±2.6 ms

* perf bench sched messaging (threaded)
  Baseline:    84.3±3.0 ms
  With mm_cid: 84.7±2.6 ms

* hackbench (process)
  Baseline:    82.9±2.7 ms
  With mm_cid: 82.9±2.9 ms

* hackbench (threaded)
  Baseline:    85.2±2.6 ms
  With mm_cid: 84.4±2.9 ms

[1] https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/linux_syscall_support.h#L26

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-8-mathieu.desnoyers@efficios.com
1 parent 99babd0 commit af7f588
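For illustration only, here is a minimal user-space sketch of how a memory allocator might consume such a compact concurrency ID to index per-CPU-style slots. The rseq thread area field is only introduced by a later patch in this series, so the get_concurrency_id() helper below is hypothetical: it simply falls back to sched_getcpu(), mirroring the kernel's task_mm_cid() fallback when CONFIG_SCHED_MM_CID is disabled.

/* Hypothetical user-space sketch -- not part of this commit. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdlib.h>
#include <unistd.h>

struct slot {
	void *freelist;			/* per-"cpu" allocator state */
};

static struct slot *slots;		/* one entry per concurrency ID */

/*
 * Placeholder: once the rseq thread area exposes the concurrency ID, this
 * would read that field instead. Falling back to the CPU number keeps the
 * scheme functional, just without the memory-usage benefits.
 */
static int get_concurrency_id(void)
{
	return sched_getcpu();
}

static struct slot *current_slot(void)
{
	return &slots[get_concurrency_id()];
}

int main(void)
{
	long nr_slots = sysconf(_SC_NPROCESSORS_CONF);

	slots = calloc(nr_slots, sizeof(*slots));
	current_slot()->freelist = NULL;	/* touch the slot for this thread */
	free(slots);
	return 0;
}

With a compact ID, an application that never runs more than N threads concurrently touches at most N slots, instead of one slot per possible CPU.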

File tree: 9 files changed, +198 -2 lines changed


fs/exec.c

Lines changed: 4 additions & 0 deletions
@@ -1010,6 +1010,7 @@ static int exec_mmap(struct mm_struct *mm)
 	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
 	tsk->mm = mm;
+	mm_init_cid(mm);
 	/*
 	 * This prevents preemption while active_mm is being loaded and
 	 * it and mm are being updated, which could cause problems for
@@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	 */
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
+	sched_mm_cid_before_execve(current);
 
 	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
@@ -1852,6 +1854,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	if (retval < 0)
 		goto out;
 
+	sched_mm_cid_after_execve(current);
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
@@ -1871,6 +1874,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	force_fatal_sig(SIGSEGV);
 
 out_unmark:
+	sched_mm_cid_after_execve(current);
 	current->fs->in_exec = 0;
 	current->in_execve = 0;

include/linux/mm.h

Lines changed: 25 additions & 0 deletions
@@ -1976,6 +1976,31 @@ struct zap_details {
 /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
 #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
 
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+static inline int task_mm_cid(struct task_struct *t)
+{
+	return t->mm_cid;
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+	/*
+	 * Use the processor id as a fall-back when the mm cid feature is
+	 * disabled. This provides functional per-cpu data structure accesses
+	 * in user-space, although it won't provide the memory usage benefits.
+	 */
+	return raw_smp_processor_id();
+}
+#endif
+
 #ifdef CONFIG_MMU
 extern bool can_do_mlock(void);
 #else

include/linux/mm_types.h

Lines changed: 42 additions & 1 deletion
@@ -645,7 +645,18 @@ struct mm_struct {
 		 * &struct mm_struct is freed.
 		 */
 		atomic_t mm_count;
-
+#ifdef CONFIG_SCHED_MM_CID
+		/**
+		 * @cid_lock: Protect cid bitmap updates vs lookups.
+		 *
+		 * Prevent situations where updates to the cid bitmap happen
+		 * concurrently with lookups. Those can lead to situations
+		 * where a lookup cannot find a free bit simply because it was
+		 * unlucky enough to load, non-atomically, bitmap words as they
+		 * were being concurrently updated by the updaters.
+		 */
+		raw_spinlock_t cid_lock;
+#endif
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
@@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
 	vmi->mas.node = MAS_START;
 }
 
+#ifdef CONFIG_SCHED_MM_CID
+/* Accessor for struct mm_struct's cidmask. */
+static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+{
+	unsigned long cid_bitmap = (unsigned long)mm;
+
+	cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	/* Skip cpu_bitmap */
+	cid_bitmap += cpumask_size();
+	return (struct cpumask *)cid_bitmap;
+}
+
+static inline void mm_init_cid(struct mm_struct *mm)
+{
+	raw_spin_lock_init(&mm->cid_lock);
+	cpumask_clear(mm_cidmask(mm));
+}
+
+static inline unsigned int mm_cid_size(void)
+{
+	return cpumask_size();
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline unsigned int mm_cid_size(void)
+{
+	return 0;
+}
+#endif /* CONFIG_SCHED_MM_CID */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
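The cid mask itself is not a named member of struct mm_struct: it lives in the dynamically sized tail of the mm_struct allocation, right after cpu_bitmap (see the mm_size computation in kernel/fork.c below). The following stand-alone, hypothetical user-space model only illustrates the pointer arithmetic that mm_cidmask() performs; none of these names are kernel APIs.

/* Hypothetical model of a trailing, dynamically sized bitmap region. */
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#define MASK_BYTES 16			/* stand-in for cpumask_size() */

struct obj {
	int first_field;
	unsigned long tail[];		/* stand-in for mm_struct::cpu_bitmap */
};

/* Mirror of mm_cidmask(): skip the first trailing mask to reach the second. */
static unsigned char *obj_cidmask(struct obj *o)
{
	unsigned char *p = (unsigned char *)o;

	p += offsetof(struct obj, tail);
	p += MASK_BYTES;		/* skip the "cpu" mask */
	return p;			/* the "cid" mask follows it */
}

int main(void)
{
	/*
	 * One allocation sized for the struct plus both trailing masks,
	 * analogous to mm_size = sizeof(struct mm_struct) + cpumask_size() +
	 * mm_cid_size() in mm_cache_init().
	 */
	struct obj *o = calloc(1, sizeof(*o) + 2 * MASK_BYTES);

	memset(obj_cidmask(o), 0, MASK_BYTES);	/* like mm_init_cid() clearing the mask */
	free(o);
	return 0;
}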

include/linux/sched.h

Lines changed: 5 additions & 0 deletions
@@ -1311,6 +1311,11 @@ struct task_struct {
 	unsigned long			rseq_event_mask;
 #endif
 
+#ifdef CONFIG_SCHED_MM_CID
+	int				mm_cid;		/* Current cid in mm */
+	int				mm_cid_active;	/* Whether cid bitmap is active */
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;
 
 	union {

init/Kconfig

Lines changed: 4 additions & 0 deletions
@@ -1041,6 +1041,10 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config SCHED_MM_CID
+	def_bool y
+	depends on SMP && RSEQ
+
 config UCLAMP_TASK_GROUP
 	bool "Utilization clamping per group of tasks"
 	depends on CGROUP_SCHED

kernel/fork.c

Lines changed: 7 additions & 1 deletion
@@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->reported_split_lock = 0;
 #endif
 
+#ifdef CONFIG_SCHED_MM_CID
+	tsk->mm_cid = -1;
+	tsk->mm_cid_active = 0;
+#endif
 	return tsk;
 
 free_stack:
@@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
+	mm_init_cid(mm);
 	return mm;
 
 fail_pcpu:
@@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
+	sched_mm_cid_fork(tsk);
 	return 0;
 }
 
@@ -3034,7 +3040,7 @@ void __init mm_cache_init(void)
 	 * dynamically sized based on the maximum CPU number this system
 	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
-	mm_size = sizeof(struct mm_struct) + cpumask_size();
+	mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
 
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
 			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,

kernel/sched/core.c

Lines changed: 51 additions & 0 deletions
@@ -5052,6 +5052,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
+	switch_mm_cid(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
@@ -11305,3 +11306,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
 {
 	trace_sched_update_nr_running_tp(rq, count);
 }
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+	unsigned long flags;
+
+	if (!mm)
+		return;
+	local_irq_save(flags);
+	mm_cid_put(mm, t->mm_cid);
+	t->mm_cid = -1;
+	t->mm_cid_active = 0;
+	local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+	unsigned long flags;
+
+	if (!mm)
+		return;
+	local_irq_save(flags);
+	mm_cid_put(mm, t->mm_cid);
+	t->mm_cid = -1;
+	t->mm_cid_active = 0;
+	local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+	unsigned long flags;
+
+	WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
+
+	local_irq_save(flags);
+	t->mm_cid = mm_cid_get(mm);
+	t->mm_cid_active = 1;
+	local_irq_restore(flags);
+	rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+	WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm || t->mm_cid != -1);
+	t->mm_cid_active = 1;
+}
+#endif

kernel/sched/sched.h

Lines changed: 58 additions & 0 deletions
@@ -3269,4 +3269,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
 	cgroup_account_cputime(curr, delta_exec);
 }
 
+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+	struct cpumask *cpumask;
+	int cid;
+
+	cpumask = mm_cidmask(mm);
+	cid = cpumask_first_zero(cpumask);
+	if (cid >= nr_cpu_ids)
+		return -1;
+	__cpumask_set_cpu(cid, cpumask);
+	return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+	lockdep_assert_irqs_disabled();
+	if (cid < 0)
+		return;
+	raw_spin_lock(&mm->cid_lock);
+	__cpumask_clear_cpu(cid, mm_cidmask(mm));
+	raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+	int ret;
+
+	lockdep_assert_irqs_disabled();
+	raw_spin_lock(&mm->cid_lock);
+	ret = __mm_cid_get(mm);
+	raw_spin_unlock(&mm->cid_lock);
+	return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+	if (prev->mm_cid_active) {
+		if (next->mm_cid_active && next->mm == prev->mm) {
+			/*
+			 * Context switch between threads in same mm, hand over
+			 * the mm_cid from prev to next.
+			 */
+			next->mm_cid = prev->mm_cid;
+			prev->mm_cid = -1;
+			return;
+		}
+		mm_cid_put(prev->mm, prev->mm_cid);
+		prev->mm_cid = -1;
+	}
+	if (next->mm_cid_active)
+		next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
 #endif /* _KERNEL_SCHED_SCHED_H */
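To make the fast-path behaviour concrete, here is a small, hypothetical user-space model of the same compact-ID scheme: an ID is the lowest clear bit of a per-map bitmap, releasing an ID clears that bit, and a switch between two tasks sharing the same map hands the ID over without taking the lock, as switch_mm_cid() does for same-mm context switches. This is only an illustration; it drops the mm_cid_active flag, irq disabling and lockdep assertions, and none of the names are kernel APIs.

/* Hypothetical single-file model of compact concurrency-ID allocation. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct map {				/* stand-in for mm_struct */
	pthread_mutex_t lock;		/* stand-in for mm->cid_lock */
	uint64_t cidmask;		/* stand-in for mm_cidmask(mm), up to 64 IDs */
};

struct task {				/* stand-in for task_struct */
	struct map *map;
	int cid;			/* -1 when no ID is held */
};

static int cid_get(struct map *m)
{
	int cid;

	pthread_mutex_lock(&m->lock);
	cid = __builtin_ffsll(~m->cidmask) - 1;	/* lowest clear bit, -1 if full */
	if (cid >= 0)
		m->cidmask |= 1ULL << cid;
	pthread_mutex_unlock(&m->lock);
	return cid;
}

static void cid_put(struct map *m, int cid)
{
	if (cid < 0)
		return;
	pthread_mutex_lock(&m->lock);
	m->cidmask &= ~(1ULL << cid);
	pthread_mutex_unlock(&m->lock);
}

/* Model of switch_mm_cid(): same map means a lock-free hand-over. */
static void cid_switch(struct task *prev, struct task *next)
{
	if (prev->map == next->map) {
		next->cid = prev->cid;
		prev->cid = -1;
		return;
	}
	cid_put(prev->map, prev->cid);
	prev->cid = -1;
	next->cid = cid_get(next->map);
}

int main(void)
{
	struct map m = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct task a = { &m, -1 }, b = { &m, -1 };

	a.cid = cid_get(&m);			/* a gets ID 0 */
	cid_switch(&a, &b);			/* hand-over: b now holds ID 0 */
	printf("a.cid=%d b.cid=%d\n", a.cid, b.cid);
	cid_put(&m, b.cid);
	return 0;
}

Because released IDs are handed out again from the bottom of the bitmap, a process that never runs more than N threads concurrently only ever observes IDs in [0, N), which is what lets user-space size its per-ID arrays by concurrency rather than by possible CPU count.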

kernel/signal.c

Lines changed: 2 additions & 0 deletions
@@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
 	cgroup_threadgroup_change_begin(tsk);
 
 	if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+		sched_mm_cid_exit_signals(tsk);
 		tsk->flags |= PF_EXITING;
 		cgroup_threadgroup_change_end(tsk);
 		return;
@@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
 	 * From now this task is not visible for group-wide signals,
 	 * see wants_signal(), do_signal_stop().
 	 */
+	sched_mm_cid_exit_signals(tsk);
 	tsk->flags |= PF_EXITING;
 
 	cgroup_threadgroup_change_end(tsk);
