Skip to content

Commit 223baf9

Browse files
compudj authored and Peter Zijlstra committed
sched: Fix performance regression introduced by mm_cid
Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL sysbench regression reported by Aaron Lu. Keep track of the currently allocated mm_cid for each mm/cpu rather than freeing them immediately on context switch. This eliminates most atomic operations when context switching back and forth between threads belonging to different memory spaces in multi-threaded scenarios (many processes, each with many threads). The per-mm/per-cpu mm_cid values are serialized by their respective runqueue locks. Thread migration is handled by introducing invocation to sched_mm_cid_migrate_to() (with destination runqueue lock held) in activate_task() for migrating tasks. If the destination cpu's mm_cid is unset, and if the source runqueue is not actively using its mm_cid, then the source cpu's mm_cid is moved to the destination cpu on migration. Introduce a task-work executed periodically, similarly to NUMA work, which delays reclaim of cid values when they are unused for a period of time. Keep track of the allocation time for each per-cpu cid, and let the task work clear them when they are observed to be older than SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all mm_cids which are greater or equal to the Hamming weight of the mm cidmask to keep concurrency ids compact. Because we want to ensure the mm_cid converges towards the smaller values as migrations happen, the prior optimization that was done when context switching between threads belonging to the same mm is removed, because it could delay the lazy release of the destination runqueue mm_cid after it has been replaced by a migration. Removing this prior optimization is not an issue performance-wise because the introduced per-mm/per-cpu mm_cid tracking also covers this more specific case. 
Fixes: af7f588 ("sched: Introduce per-memory-map concurrency ID") Reported-by: Aaron Lu <aaron.lu@intel.com> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Aaron Lu <aaron.lu@intel.com> Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
1 parent 5a4d3b3 commit 223baf9

File tree

6 files changed

+804
-57
lines changed

6 files changed

+804
-57
lines changed

include/linux/mm_types.h

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,13 @@ struct vm_area_struct {
550550
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
551551
} __randomize_layout;
552552

553+
#ifdef CONFIG_SCHED_MM_CID
554+
struct mm_cid {
555+
u64 time;
556+
int cid;
557+
};
558+
#endif
559+
553560
struct kioctx_table;
554561
struct mm_struct {
555562
struct {
@@ -600,15 +607,19 @@ struct mm_struct {
600607
atomic_t mm_count;
601608
#ifdef CONFIG_SCHED_MM_CID
602609
/**
603-
* @cid_lock: Protect cid bitmap updates vs lookups.
610+
* @pcpu_cid: Per-cpu current cid.
604611
*
605-
* Prevent situations where updates to the cid bitmap happen
606-
* concurrently with lookups. Those can lead to situations
607-
* where a lookup cannot find a free bit simply because it was
608-
* unlucky enough to load, non-atomically, bitmap words as they
609-
* were being concurrently updated by the updaters.
612+
* Keep track of the currently allocated mm_cid for each cpu.
613+
* The per-cpu mm_cid values are serialized by their respective
614+
* runqueue locks.
610615
*/
611-
raw_spinlock_t cid_lock;
616+
struct mm_cid __percpu *pcpu_cid;
617+
/*
618+
* @mm_cid_next_scan: Next mm_cid scan (in jiffies).
619+
*
620+
* When the next mm_cid scan is due (in jiffies).
621+
*/
622+
unsigned long mm_cid_next_scan;
612623
#endif
613624
#ifdef CONFIG_MMU
614625
atomic_long_t pgtables_bytes; /* size of all page tables */
@@ -873,6 +884,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
873884
}
874885

875886
#ifdef CONFIG_SCHED_MM_CID
887+
888+
enum mm_cid_state {
889+
MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
890+
MM_CID_LAZY_PUT = (1U << 31),
891+
};
892+
893+
static inline bool mm_cid_is_unset(int cid)
894+
{
895+
return cid == MM_CID_UNSET;
896+
}
897+
898+
static inline bool mm_cid_is_lazy_put(int cid)
899+
{
900+
return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
901+
}
902+
903+
static inline bool mm_cid_is_valid(int cid)
904+
{
905+
return !(cid & MM_CID_LAZY_PUT);
906+
}
907+
908+
static inline int mm_cid_set_lazy_put(int cid)
909+
{
910+
return cid | MM_CID_LAZY_PUT;
911+
}
912+
913+
static inline int mm_cid_clear_lazy_put(int cid)
914+
{
915+
return cid & ~MM_CID_LAZY_PUT;
916+
}
917+
876918
/* Accessor for struct mm_struct's cidmask. */
877919
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
878920
{
@@ -886,16 +928,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
886928

887929
static inline void mm_init_cid(struct mm_struct *mm)
888930
{
889-
raw_spin_lock_init(&mm->cid_lock);
931+
int i;
932+
933+
for_each_possible_cpu(i) {
934+
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
935+
936+
pcpu_cid->cid = MM_CID_UNSET;
937+
pcpu_cid->time = 0;
938+
}
890939
cpumask_clear(mm_cidmask(mm));
891940
}
892941

942+
static inline int mm_alloc_cid(struct mm_struct *mm)
943+
{
944+
mm->pcpu_cid = alloc_percpu(struct mm_cid);
945+
if (!mm->pcpu_cid)
946+
return -ENOMEM;
947+
mm_init_cid(mm);
948+
return 0;
949+
}
950+
951+
static inline void mm_destroy_cid(struct mm_struct *mm)
952+
{
953+
free_percpu(mm->pcpu_cid);
954+
mm->pcpu_cid = NULL;
955+
}
956+
893957
static inline unsigned int mm_cid_size(void)
894958
{
895959
return cpumask_size();
896960
}
897961
#else /* CONFIG_SCHED_MM_CID */
898962
static inline void mm_init_cid(struct mm_struct *mm) { }
963+
static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
964+
static inline void mm_destroy_cid(struct mm_struct *mm) { }
899965
static inline unsigned int mm_cid_size(void)
900966
{
901967
return 0;

include/linux/sched.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,7 +1314,10 @@ struct task_struct {
13141314

13151315
#ifdef CONFIG_SCHED_MM_CID
13161316
int mm_cid; /* Current cid in mm */
1317+
int last_mm_cid; /* Most recent cid in mm */
1318+
int migrate_from_cpu;
13171319
int mm_cid_active; /* Whether cid bitmap is active */
1320+
struct callback_head cid_work;
13181321
#endif
13191322

13201323
struct tlbflush_unmap_batch tlb_ubc;

include/linux/sched/mm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
3737
atomic_inc(&mm->mm_count);
3838
}
3939

40+
static inline void smp_mb__after_mmgrab(void)
41+
{
42+
smp_mb__after_atomic();
43+
}
44+
4045
extern void __mmdrop(struct mm_struct *mm);
4146

4247
static inline void mmdrop(struct mm_struct *mm)

kernel/fork.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,7 @@ void __mmdrop(struct mm_struct *mm)
793793
check_mm(mm);
794794
put_user_ns(mm->user_ns);
795795
mm_pasid_drop(mm);
796+
mm_destroy_cid(mm);
796797

797798
for (i = 0; i < NR_MM_COUNTERS; i++)
798799
percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1057,7 +1058,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
10571058

10581059
#ifdef CONFIG_SCHED_MM_CID
10591060
tsk->mm_cid = -1;
1061+
tsk->last_mm_cid = -1;
10601062
tsk->mm_cid_active = 0;
1063+
tsk->migrate_from_cpu = -1;
10611064
#endif
10621065
return tsk;
10631066

@@ -1162,18 +1165,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
11621165
if (init_new_context(p, mm))
11631166
goto fail_nocontext;
11641167

1168+
if (mm_alloc_cid(mm))
1169+
goto fail_cid;
1170+
11651171
for (i = 0; i < NR_MM_COUNTERS; i++)
11661172
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
11671173
goto fail_pcpu;
11681174

11691175
mm->user_ns = get_user_ns(user_ns);
11701176
lru_gen_init_mm(mm);
1171-
mm_init_cid(mm);
11721177
return mm;
11731178

11741179
fail_pcpu:
11751180
while (i > 0)
11761181
percpu_counter_destroy(&mm->rss_stat[--i]);
1182+
mm_destroy_cid(mm);
1183+
fail_cid:
11771184
fail_nocontext:
11781185
mm_free_pgd(mm);
11791186
fail_nopgd:

0 commit comments

Comments
 (0)