
Commit 245254f

Byte-Lab authored and htejun committed
sched_ext: Implement sched_ext_ops.cpu_acquire/release()
Scheduler classes are strictly ordered and when a higher priority class has tasks to run, the lower priority ones lose access to the CPU. Being able to monitor and act on these events is necessary for use cases including strict core-scheduling and latency management.

This patch adds two operations, ops.cpu_acquire() and ops.cpu_release(). The former is invoked when a CPU becomes available to the BPF scheduler and the opposite for the latter. This patch also implements scx_bpf_reenqueue_local() which can be called from ops.cpu_release() to trigger requeueing of all tasks in the local dsq of the CPU so that the tasks can be reassigned to other available CPUs.

scx_pair is updated to use .cpu_acquire/release() along with %SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU is preempted by a higher priority scheduler class.

scx_qmap is updated to use .cpu_acquire/release() to empty the local dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers that want to have tight control over latency.

v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing.

v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers.

v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local() from ops.cpu_release() nested inside ops.init() and other sleepable operations.

Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
1 parent 90e5516 commit 245254f
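For orientation, a minimal sketch of how a BPF scheduler might wire up the new callbacks is shown below. It is not part of this commit: the "sketch_*" names, the SEC(".struct_ops.link") placement, and the include path are assumptions modeled on the tools/sched_ext examples, while the callback signatures, struct scx_cpu_release_args, and scx_bpf_reenqueue_local() are the ones this patch adds.

/*
 * Hedged sketch (not from this commit): react to losing a CPU to a
 * higher-priority sched_class by bouncing the CPU's local DSQ back
 * through ops.enqueue(). The "sketch_*" names are illustrative only.
 */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * A higher-priority class took the CPU; args->reason says which.
	 * Re-enqueue everything sitting on this CPU's local DSQ so that
	 * ops.enqueue() can place it on CPUs SCX still controls.
	 */
	scx_bpf_reenqueue_local();
}

void BPF_STRUCT_OPS(sketch_cpu_acquire, s32 cpu,
		    struct scx_cpu_acquire_args *args)
{
	/* The CPU is under SCX control again; nothing to do here. */
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.cpu_acquire	= (void *)sketch_cpu_acquire,
	.cpu_release	= (void *)sketch_cpu_release,
	.name		= "sketch",
};

The enqueue-side handling of the re-enqueued tasks is sketched after the kernel/sched/ext.c diff below.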

File tree

7 files changed: +240 -7 lines changed


include/linux/sched/ext.h

Lines changed: 3 additions & 1 deletion
@@ -98,13 +98,15 @@ enum scx_kf_mask {
 	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
 	/* all non-sleepables may be nested inside SLEEPABLE */
 	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
+	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
+	SCX_KF_CPU_RELEASE	= 1 << 1, /* ops.cpu_release() */
 	/* ops.dequeue (in REST) may be nested inside DISPATCH */
 	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
 	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
 	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
 	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */

-	__SCX_KF_RQ_LOCKED	= SCX_KF_DISPATCH |
+	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
 				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };

kernel/sched/ext.c

Lines changed: 195 additions & 3 deletions
@@ -110,6 +110,32 @@ struct scx_exit_task_args {
 	bool cancelled;
 };

+enum scx_cpu_preempt_reason {
+	/* next task is being scheduled by &sched_class_rt */
+	SCX_CPU_PREEMPT_RT,
+	/* next task is being scheduled by &sched_class_dl */
+	SCX_CPU_PREEMPT_DL,
+	/* next task is being scheduled by &sched_class_stop */
+	SCX_CPU_PREEMPT_STOP,
+	/* unknown reason for SCX being preempted */
+	SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+	/* the reason the CPU was preempted */
+	enum scx_cpu_preempt_reason reason;
+
+	/* the task that's going to be scheduled on the CPU */
+	struct task_struct *task;
+};
+
 /*
  * Informational context provided to dump operations.
  */
@@ -335,6 +361,28 @@ struct sched_ext_ops {
 	 */
 	void (*update_idle)(s32 cpu, bool idle);

+	/**
+	 * cpu_acquire - A CPU is becoming available to the BPF scheduler
+	 * @cpu: The CPU being acquired by the BPF scheduler.
+	 * @args: Acquire arguments, see the struct definition.
+	 *
+	 * A CPU that was previously released from the BPF scheduler is now once
+	 * again under its control.
+	 */
+	void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+	/**
+	 * cpu_release - A CPU is taken away from the BPF scheduler
+	 * @cpu: The CPU being released by the BPF scheduler.
+	 * @args: Release arguments, see the struct definition.
+	 *
+	 * The specified CPU is no longer under the control of the BPF
+	 * scheduler. This could be because it was preempted by a higher
+	 * priority sched_class, though there may be other reasons as well. The
+	 * caller should consult @args->reason to determine the cause.
+	 */
+	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
 	/**
 	 * init_task - Initialize a task to run in a BPF scheduler
 	 * @p: task to initialize for BPF scheduling
@@ -487,6 +535,17 @@ enum scx_enq_flags {
 	 */
 	SCX_ENQ_PREEMPT		= 1LLU << 32,

+	/*
+	 * The task being enqueued was previously enqueued on the current CPU's
+	 * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+	 * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+	 * invoked in a ->cpu_release() callback, and the task is again
+	 * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
+	 * task will not be scheduled on the CPU until at least the next invocation
+	 * of the ->cpu_acquire() callback.
+	 */
+	SCX_ENQ_REENQ		= 1LLU << 40,
+
 	/*
 	 * The task being enqueued is the only task available for the cpu. By
 	 * default, ext core keeps executing such tasks but when
@@ -625,6 +684,7 @@ static bool scx_warned_zero_slice;

 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);

 struct static_key_false scx_has_op[SCX_OPI_END] =
@@ -887,6 +947,12 @@ static __always_inline bool scx_kf_allowed(u32 mask)
 	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
 	 * boundary thanks to the above in_interrupt() check.
 	 */
+	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
+		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
+		scx_ops_error("cpu_release kfunc called from a nested operation");
+		return false;
+	}
+
 	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
 		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
 		scx_ops_error("dispatch kfunc called from a nested operation");
@@ -2070,14 +2136,29 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_BALANCING;

+	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+	    unlikely(rq->scx.cpu_released)) {
+		/*
+		 * If the previous sched_class for the current CPU was not SCX,
+		 * notify the BPF scheduler that it again has control of the
+		 * core. This callback complements ->cpu_release(), which is
+		 * emitted in scx_next_task_picked().
+		 */
+		if (SCX_HAS_OP(cpu_acquire))
+			SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+		rq->scx.cpu_released = false;
+	}
+
 	if (prev_on_scx) {
 		WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
 		update_curr_scx(rq);

 		/*
 		 * If @prev is runnable & has slice left, it has priority and
 		 * fetching more just increases latency for the fetched tasks.
-		 * Tell put_prev_task_scx() to put @prev on local_dsq.
+		 * Tell put_prev_task_scx() to put @prev on local_dsq. If the
+		 * BPF scheduler wants to handle this explicitly, it should
+		 * implement ->cpu_released().
 		 *
 		 * See scx_ops_disable_workfn() for the explanation on the
 		 * bypassing test.
@@ -2297,6 +2378,20 @@ static struct task_struct *pick_next_task_scx(struct rq *rq)
 	return p;
 }

+static enum scx_cpu_preempt_reason
+preempt_reason_from_class(const struct sched_class *class)
+{
+#ifdef CONFIG_SMP
+	if (class == &stop_sched_class)
+		return SCX_CPU_PREEMPT_STOP;
+#endif
+	if (class == &dl_sched_class)
+		return SCX_CPU_PREEMPT_DL;
+	if (class == &rt_sched_class)
+		return SCX_CPU_PREEMPT_RT;
+	return SCX_CPU_PREEMPT_UNKNOWN;
+}
+
 void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 			  const struct sched_class *active)
 {
@@ -2312,6 +2407,40 @@ void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 	 */
 	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
 #endif
+	if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+		return;
+
+	/*
+	 * The callback is conceptually meant to convey that the CPU is no
+	 * longer under the control of SCX. Therefore, don't invoke the
+	 * callback if the CPU is is staying on SCX, or going idle (in which
+	 * case the SCX scheduler has actively decided not to schedule any
+	 * tasks on the CPU).
+	 */
+	if (likely(active >= &ext_sched_class))
+		return;
+
+	/*
+	 * At this point we know that SCX was preempted by a higher priority
+	 * sched_class, so invoke the ->cpu_release() callback if we have not
+	 * done so already. We only send the callback once between SCX being
+	 * preempted, and it regaining control of the CPU.
+	 *
+	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
+	 * next time that balance_scx() is invoked.
+	 */
+	if (!rq->scx.cpu_released) {
+		if (SCX_HAS_OP(cpu_release)) {
+			struct scx_cpu_release_args args = {
+				.reason = preempt_reason_from_class(active),
+				.task = p,
+			};
+
+			SCX_CALL_OP(SCX_KF_CPU_RELEASE,
+				    cpu_release, cpu_of(rq), &args);
+		}
+		rq->scx.cpu_released = true;
+	}
 }

 #ifdef CONFIG_SMP
@@ -3398,6 +3527,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		static_branch_disable_cpuslocked(&scx_has_op[i]);
 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
+	static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
 	static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
 	synchronize_rcu();

@@ -3699,9 +3829,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		seq_buf_init(&ns, buf, avail);

 		dump_newline(&ns);
-		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu pnt_seq=%lu",
+		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
 			  cpu, rq->scx.nr_running, rq->scx.flags,
-			  rq->scx.ops_qseq, rq->scx.pnt_seq);
+			  rq->scx.cpu_released, rq->scx.ops_qseq,
+			  rq->scx.pnt_seq);
 		dump_line(&ns, " curr=%s[%d] class=%ps",
 			  rq->curr->comm, rq->curr->pid,
 			  rq->curr->sched_class);
@@ -3942,6 +4073,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

 	if (ops->flags & SCX_OPS_ENQ_EXITING)
 		static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
+	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
+		static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);

 	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
 		reset_idle_masks();
@@ -4318,6 +4451,8 @@ static bool yield_stub(struct task_struct *from, struct task_struct *to) { retur
 static void set_weight_stub(struct task_struct *p, u32 weight) {}
 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
 static void update_idle_stub(s32 cpu, bool idle) {}
+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
 static void enable_stub(struct task_struct *p) {}
@@ -4338,6 +4473,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.set_weight		= set_weight_stub,
 	.set_cpumask		= set_cpumask_stub,
 	.update_idle		= update_idle_stub,
+	.cpu_acquire		= cpu_acquire_stub,
+	.cpu_release		= cpu_release_stub,
 	.init_task		= init_task_stub,
 	.exit_task		= exit_task_stub,
 	.enable			= enable_stub,
@@ -4870,6 +5007,59 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {

 __bpf_kfunc_start_defs();

+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+	u32 nr_enqueued, i;
+	struct rq *rq;
+
+	if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+		return 0;
+
+	rq = cpu_rq(smp_processor_id());
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Get the number of tasks on the local DSQ before iterating over it to
+	 * pull off tasks. The enqueue callback below can signal that it wants
+	 * the task to stay on the local DSQ, and we want to prevent the BPF
+	 * scheduler from causing us to loop indefinitely.
+	 */
+	nr_enqueued = rq->scx.local_dsq.nr;
+	for (i = 0; i < nr_enqueued; i++) {
+		struct task_struct *p;
+
+		p = first_local_task(rq);
+		WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) !=
+			     SCX_OPSS_NONE);
+		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
+		WARN_ON_ONCE(p->scx.holding_cpu != -1);
+		dispatch_dequeue(rq, p);
+		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+	}
+
+	return nr_enqueued;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_cpu_release,
+};
+
+__bpf_kfunc_start_defs();
+
 /**
  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
  * @cpu: cpu to kick
@@ -5379,6 +5569,8 @@ static int __init scx_init(void)
 					     &scx_kfunc_set_enqueue_dispatch)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_dispatch)) ||
+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					     &scx_kfunc_set_cpu_release)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_any)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
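To connect SCX_ENQ_REENQ with the kfunc above, here is a hedged enqueue-side sketch continuing the illustrative scheduler from the top of the page. It is not taken from this commit: SHARED_DSQ is an assumed DSQ id that would have to be created in ops.init(), scx_bpf_pick_idle_cpu() is assumed to be available to the program, and the idle-kick pattern simply follows the v4 note about SCX_KICK_IDLE; see scx_qmap for the real usage.

/*
 * Hedged sketch (not from this commit): tasks pulled off a preempted CPU by
 * scx_bpf_reenqueue_local() come back through ops.enqueue() with
 * SCX_ENQ_REENQ set. SHARED_DSQ is an assumed user-created DSQ id.
 */
#define SHARED_DSQ	0

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		/*
		 * Don't send the task back to the CPU it just lost; park it
		 * on a shared DSQ and wake an idle CPU to pick it up.
		 */
		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	/* Default path: run on the local DSQ of the enqueueing CPU. */
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
}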

kernel/sched/ext.h

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
 #define scx_enabled()		static_branch_unlikely(&__scx_ops_enabled)
 #define scx_switched_all()	static_branch_unlikely(&__scx_switched_all)

+DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
+
 static inline bool task_on_scx(const struct task_struct *p)
 {
 	return scx_enabled() && p->sched_class == &ext_sched_class;

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
@@ -737,6 +737,7 @@ struct scx_rq {
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
 	u32			flags;
+	bool			cpu_released;
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;
 	cpumask_var_t		cpus_to_preempt;

tools/sched_ext/include/scx/common.bpf.h

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flag
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
+u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
