Commit 14c1da3

sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
On systems with more than 4096 CPUs, the scx_kick_cpus_pnt_seqs allocation fails during boot because it exceeds the 32,768-byte percpu allocator limit. Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU pointing to its own kvzalloc'd array. Move the allocation from boot time to scx_enable() and free it in scx_disable(), so the O(nr_cpu_ids^2) memory is only consumed while sched_ext is active. Use RCU to guard against racing with the free path: arrays are freed via call_rcu(), and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check.

While at it, rename the variable to scx_kick_pseqs for brevity and update the comments to clarify that these are pick_task sequence numbers.

v2: RCU-protect scx_kick_pseqs to handle kick_cpus_irq_workfn() racing against disable, as per Andrea.

v3: Fix bugs noticed by Andrea.

Reported-by: Phil Auld <pauld@redhat.com>
Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
Cc: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Parent: a8ad873
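For scale: each per-CPU copy of the old array held one unsigned long per possible CPU, so the percpu request was nr_cpu_ids * 8 bytes on 64-bit, which crosses the allocator's 32 KiB per-allocation cap (PCPU_MIN_UNIT_SIZE) just above 4096 CPUs. A minimal userspace C sketch of the arithmetic; the CPU counts are hypothetical and PCPU_MIN_UNIT_SIZE here merely mirrors the kernel's constant:

#include <stdio.h>

/* Mirrors the kernel's PCPU_MIN_UNIT_SIZE (32 << 10); illustrative only. */
#define PCPU_MIN_UNIT_SIZE	(32 << 10)

int main(void)
{
	/* Hypothetical machine sizes; 4096 is the break-even point. */
	unsigned long nr_cpus[] = { 1024, 4096, 8192 };

	for (int i = 0; i < 3; i++) {
		unsigned long bytes = nr_cpus[i] * sizeof(unsigned long);

		printf("%5lu CPUs -> %6lu bytes per CPU: %s\n",
		       nr_cpus[i], bytes,
		       bytes > PCPU_MIN_UNIT_SIZE ? "exceeds percpu limit"
						  : "fits");
	}
	return 0;
}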

kernel/sched/ext.c

Lines changed: 79 additions & 10 deletions
@@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
 
 static struct delayed_work scx_watchdog_work;
 
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
+ * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
+ * lazily when enabling and freed when disabling to avoid waste when sched_ext
+ * isn't active.
+ */
+struct scx_kick_pseqs {
+	struct rcu_head rcu;
+	unsigned long seqs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
 
 /*
  * Direct dispatch marker.
@@ -3877,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 	}
 }
 
+static void free_kick_pseqs_rcu(struct rcu_head *rcu)
+{
+	struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
+
+	kvfree(pseqs);
+}
+
+static void free_kick_pseqs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *to_free;
+
+		to_free = rcu_replace_pointer(*pseqs, NULL, true);
+		if (to_free)
+			call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+	}
+}
+
 static void scx_disable_workfn(struct kthread_work *work)
 {
 	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
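The free path above follows the standard RCU retire pattern: rcu_replace_pointer() swaps each per-CPU pointer to NULL (the third argument, true, stands in for the update-side lockdep assertion), and call_rcu() defers the actual kvfree() until every pre-existing reader has left its critical section. A standalone sketch of the same pattern, using a hypothetical my_buf type rather than anything from this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_buf {
	struct rcu_head rcu;
	unsigned long data[];
};

static struct my_buf __rcu *my_buf_ptr;

static void my_buf_free_rcu(struct rcu_head *rcu)
{
	/* Runs after a grace period; no reader can still hold the pointer. */
	kvfree(container_of(rcu, struct my_buf, rcu));
}

static void my_buf_teardown(void)
{
	struct my_buf *old;

	/* "true" stands in for lockdep_is_held() on the updater-side lock. */
	old = rcu_replace_pointer(my_buf_ptr, NULL, true);
	if (old)
		call_rcu(&old->rcu, my_buf_free_rcu);
}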
@@ -4013,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 	free_percpu(scx_dsp_ctx);
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
+	free_kick_pseqs();
 
 	mutex_unlock(&scx_enable_mutex);
 
@@ -4375,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch,
 	irq_work_queue(&sch->error_irq_work);
 }
 
+static int alloc_kick_pseqs(void)
+{
+	int cpu;
+
+	/*
+	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+	 * can exceed percpu allocator limits on large machines.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *new_pseqs;
+
+		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+
+		new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+					  GFP_KERNEL, cpu_to_node(cpu));
+		if (!new_pseqs) {
+			free_kick_pseqs();
+			return -ENOMEM;
+		}
+
+		rcu_assign_pointer(*pseqs, new_pseqs);
+	}
+
+	return 0;
+}
+
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 {
 	struct scx_sched *sch;
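The allocation side is the mirror image: struct_size(new_pseqs, seqs, nr_cpu_ids) computes sizeof(struct scx_kick_pseqs) plus nr_cpu_ids sequence slots with overflow checking, kvzalloc_node() falls back to vmalloc when the request is too large for contiguous pages, and rcu_assign_pointer() publishes the fully initialized array with the store barrier that readers depend on. Continuing the hypothetical my_buf sketch from above:

static int my_buf_setup(unsigned long nents)
{
	struct my_buf *new;

	/* sizeof(*new) + nents * sizeof(new->data[0]), overflow-checked. */
	new = kvzalloc(struct_size(new, data, nents), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* Publish: readers that see the pointer see initialized contents. */
	rcu_assign_pointer(my_buf_ptr, new);
	return 0;
}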
@@ -4517,15 +4577,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	mutex_lock(&scx_enable_mutex);
 
+	ret = alloc_kick_pseqs();
+	if (ret)
+		goto err_unlock;
+
 	if (scx_enable_state() != SCX_DISABLED) {
 		ret = -EBUSY;
-		goto err_unlock;
+		goto err_free_pseqs;
 	}
 
 	sch = scx_alloc_and_add_sched(ops);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
-		goto err_unlock;
+		goto err_free_pseqs;
 	}
 
 	/*
@@ -4728,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	return 0;
 
+err_free_pseqs:
+	free_kick_pseqs();
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
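These two hunks form the usual kernel goto-unwind idiom: resources acquired later unwind earlier, so the new err_free_pseqs label sits above err_unlock and falls through to it. A generic sketch of the shape, with hypothetical my_* helpers standing in for the patch's functions:

#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);

int my_alloc(void);
void my_free(void);
int my_register(void);

static int my_enable(void)
{
	int ret;

	mutex_lock(&my_mutex);

	ret = my_alloc();		/* acquired first ...              */
	if (ret)
		goto err_unlock;

	ret = my_register();
	if (ret)
		goto err_free;		/* ... so it unwinds first on error */

	mutex_unlock(&my_mutex);
	return 0;

err_free:
	my_free();
err_unlock:
	mutex_unlock(&my_mutex);
	return ret;
}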
@@ -5109,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 {
 	struct rq *this_rq = this_rq();
 	struct scx_rq *this_scx = &this_rq->scx;
-	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+	struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
 	bool should_wait = false;
+	unsigned long *pseqs;
 	s32 cpu;
 
+	if (unlikely(!pseqs_pcpu)) {
+		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+		return;
+	}
+
+	pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
 		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
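On the reader side, the irq_work runs with bottom halves implicitly disabled, so rcu_dereference_bh() marks the RCU-bh read-side critical section, and the NULL check covers the window in which scx_disable() has already torn the arrays down. The reader half of the hypothetical my_buf sketch:

static void my_buf_reader(void)
{
	struct my_buf *buf;

	/* BH-disabled context doubles as the RCU-bh read-side section. */
	buf = rcu_dereference_bh(my_buf_ptr);
	if (unlikely(!buf))
		return;		/* torn down concurrently; nothing to do */

	/* ... buf->data[] is safe to use until we return ... */
}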
@@ -5235,11 +5309,6 @@ void __init init_sched_ext_class(void)
 
 	scx_idle_init_masks();
 
-	scx_kick_cpus_pnt_seqs =
-		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
-			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
-	BUG_ON(!scx_kick_cpus_pnt_seqs);
-
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
 		int n = cpu_to_node(cpu);
