@@ -110,6 +110,32 @@ struct scx_exit_task_args {
 	bool cancelled;
 };
 
+enum scx_cpu_preempt_reason {
+	/* next task is being scheduled by &sched_class_rt */
+	SCX_CPU_PREEMPT_RT,
+	/* next task is being scheduled by &sched_class_dl */
+	SCX_CPU_PREEMPT_DL,
+	/* next task is being scheduled by &sched_class_stop */
+	SCX_CPU_PREEMPT_STOP,
+	/* unknown reason for SCX being preempted */
+	SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+	/* the reason the CPU was preempted */
+	enum scx_cpu_preempt_reason reason;
+
+	/* the task that's going to be scheduled on the CPU */
+	struct task_struct *task;
+};
+
 /*
  * Informational context provided to dump operations.
  */
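For orientation (not part of the patch), here is a rough sketch of how a BPF scheduler could consume these argument containers, tallying why it keeps losing CPUs. The handler and array names are hypothetical, and the SEC()/BPF_PROG() boilerplate follows the usual libbpf conventions rather than anything introduced by this series.

/* hypothetical BPF-side consumer of struct scx_cpu_release_args */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* one counter per scx_cpu_preempt_reason value, indexed by the enum above */
u64 nr_preempts[SCX_CPU_PREEMPT_UNKNOWN + 1];

SEC("struct_ops/count_cpu_release")
void BPF_PROG(count_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 reason = args->reason;

	if (reason <= SCX_CPU_PREEMPT_UNKNOWN)
		__sync_fetch_and_add(&nr_preempts[reason], 1);
}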
@@ -335,6 +361,28 @@ struct sched_ext_ops {
 	 */
 	void (*update_idle)(s32 cpu, bool idle);
 
+	/**
+	 * cpu_acquire - A CPU is becoming available to the BPF scheduler
+	 * @cpu: The CPU being acquired by the BPF scheduler.
+	 * @args: Acquire arguments, see the struct definition.
+	 *
+	 * A CPU that was previously released from the BPF scheduler is now once
+	 * again under its control.
+	 */
+	void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+	/**
+	 * cpu_release - A CPU is taken away from the BPF scheduler
+	 * @cpu: The CPU being released by the BPF scheduler.
+	 * @args: Release arguments, see the struct definition.
+	 *
+	 * The specified CPU is no longer under the control of the BPF
+	 * scheduler. This could be because it was preempted by a higher
+	 * priority sched_class, though there may be other reasons as well. The
+	 * caller should consult @args->reason to determine the cause.
+	 */
+	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
 	/**
 	 * init_task - Initialize a task to run in a BPF scheduler
 	 * @p: task to initialize for BPF scheduling
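As a usage illustration (not part of this patch), the sketch below shows how a BPF scheduler might wire up the two new callbacks to track which CPUs it currently owns. The handler and array names are hypothetical; SEC(".struct_ops.link"), BPF_PROG() and the MAX_CPUS bound are assumptions following the conventions of the example schedulers, not definitions added by this series.

/* sketch_ownership.bpf.c - hypothetical, minimal; error handling omitted */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

#define MAX_CPUS 512			/* illustrative bound, not from the patch */

/* true while the corresponding CPU is under SCX control */
bool cpu_owned[MAX_CPUS];

SEC("struct_ops/sketch_cpu_acquire")
void BPF_PROG(sketch_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args)
{
	if (cpu >= 0 && cpu < MAX_CPUS)
		cpu_owned[cpu] = true;
}

SEC("struct_ops/sketch_cpu_release")
void BPF_PROG(sketch_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	if (cpu >= 0 && cpu < MAX_CPUS)
		cpu_owned[cpu] = false;
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.cpu_acquire	= (void *)sketch_cpu_acquire,
	.cpu_release	= (void *)sketch_cpu_release,
	/* .enqueue, .dispatch, .name, ... omitted for brevity */
};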
@@ -487,6 +535,17 @@ enum scx_enq_flags {
 	 */
 	SCX_ENQ_PREEMPT		= 1LLU << 32,
 
+	/*
+	 * The task being enqueued was previously enqueued on the current CPU's
+	 * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+	 * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+	 * invoked in a ->cpu_release() callback, and the task is again
+	 * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
+	 * task will not be scheduled on the CPU until at least the next
+	 * invocation of the ->cpu_acquire() callback.
+	 */
+	SCX_ENQ_REENQ		= 1LLU << 40,
+
 	/*
 	 * The task being enqueued is the only task available for the cpu. By
 	 * default, ext core keeps executing such tasks but when
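To illustrate the flag from the BPF side (again, not part of this patch), an ->enqueue() implementation might push re-enqueued tasks onto the global DSQ so that any CPU can pick them up, instead of parking them behind the preempting sched_class. The handler name is hypothetical; scx_bpf_dispatch(), SCX_DSQ_LOCAL, SCX_DSQ_GLOBAL and SCX_SLICE_DFL are assumed from the rest of the series, with the kfunc declaration normally provided by the series' common BPF header.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* declaration assumed to come from the series' common BPF header */
extern void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
			     u64 enq_flags) __ksym;

SEC("struct_ops/sketch_enqueue")
void BPF_PROG(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * Tasks bounced off a preempted CPU's local DSQ arrive here with
	 * SCX_ENQ_REENQ set; spread them via the global DSQ instead of
	 * re-parking them behind the higher priority class.
	 */
	if (enq_flags & SCX_ENQ_REENQ)
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	else
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
}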
@@ -625,6 +684,7 @@ static bool scx_warned_zero_slice;
 
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
 struct static_key_false scx_has_op[SCX_OPI_END] =
@@ -887,6 +947,12 @@ static __always_inline bool scx_kf_allowed(u32 mask)
 	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
 	 * boundary thanks to the above in_interrupt() check.
 	 */
+	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
+		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
+		scx_ops_error("cpu_release kfunc called from a nested operation");
+		return false;
+	}
+
 	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
 		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
 		scx_ops_error("dispatch kfunc called from a nested operation");
@@ -2070,14 +2136,29 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_BALANCING;
 
+	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+	    unlikely(rq->scx.cpu_released)) {
+		/*
+		 * If the previous sched_class for the current CPU was not SCX,
+		 * notify the BPF scheduler that it again has control of the
+		 * core. This callback complements ->cpu_release(), which is
+		 * emitted in scx_next_task_picked().
+		 */
+		if (SCX_HAS_OP(cpu_acquire))
+			SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+		rq->scx.cpu_released = false;
+	}
+
 	if (prev_on_scx) {
 		WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
 		update_curr_scx(rq);
 
 		/*
 		 * If @prev is runnable & has slice left, it has priority and
 		 * fetching more just increases latency for the fetched tasks.
-		 * Tell put_prev_task_scx() to put @prev on local_dsq.
+		 * Tell put_prev_task_scx() to put @prev on local_dsq. If the
+		 * BPF scheduler wants to handle this explicitly, it should
+		 * implement ->cpu_release().
 		 *
 		 * See scx_ops_disable_workfn() for the explanation on the
 		 * bypassing test.
@@ -2297,6 +2378,20 @@ static struct task_struct *pick_next_task_scx(struct rq *rq)
 	return p;
 }
 
+static enum scx_cpu_preempt_reason
+preempt_reason_from_class(const struct sched_class *class)
+{
+#ifdef CONFIG_SMP
+	if (class == &stop_sched_class)
+		return SCX_CPU_PREEMPT_STOP;
+#endif
+	if (class == &dl_sched_class)
+		return SCX_CPU_PREEMPT_DL;
+	if (class == &rt_sched_class)
+		return SCX_CPU_PREEMPT_RT;
+	return SCX_CPU_PREEMPT_UNKNOWN;
+}
+
 void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 			  const struct sched_class *active)
 {
@@ -2312,6 +2407,40 @@ void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 	 */
 	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
 #endif
+	if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+		return;
+
+	/*
+	 * The callback is conceptually meant to convey that the CPU is no
+	 * longer under the control of SCX. Therefore, don't invoke the
+	 * callback if the CPU is staying on SCX, or going idle (in which
+	 * case the SCX scheduler has actively decided not to schedule any
+	 * tasks on the CPU).
+	 */
+	if (likely(active >= &ext_sched_class))
+		return;
+
+	/*
+	 * At this point we know that SCX was preempted by a higher priority
+	 * sched_class, so invoke the ->cpu_release() callback if we have not
+	 * done so already. We only send the callback once between SCX being
+	 * preempted, and it regaining control of the CPU.
+	 *
+	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
+	 * next time that balance_scx() is invoked.
+	 */
+	if (!rq->scx.cpu_released) {
+		if (SCX_HAS_OP(cpu_release)) {
+			struct scx_cpu_release_args args = {
+				.reason = preempt_reason_from_class(active),
+				.task = p,
+			};
+
+			SCX_CALL_OP(SCX_KF_CPU_RELEASE,
+				    cpu_release, cpu_of(rq), &args);
+		}
+		rq->scx.cpu_released = true;
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -3398,6 +3527,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		static_branch_disable_cpuslocked(&scx_has_op[i]);
 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
+	static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
 	static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
 	synchronize_rcu();
 
@@ -3699,9 +3829,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		seq_buf_init(&ns, buf, avail);
 
 		dump_newline(&ns);
-		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu pnt_seq=%lu",
+		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
 			  cpu, rq->scx.nr_running, rq->scx.flags,
-			  rq->scx.ops_qseq, rq->scx.pnt_seq);
+			  rq->scx.cpu_released, rq->scx.ops_qseq,
+			  rq->scx.pnt_seq);
 		dump_line(&ns, " curr=%s[%d] class=%ps",
 			  rq->curr->comm, rq->curr->pid,
 			  rq->curr->sched_class);
@@ -3942,6 +4073,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	if (ops->flags & SCX_OPS_ENQ_EXITING)
 		static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
+	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
+		static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);
 
 	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
 		reset_idle_masks();
@@ -4318,6 +4451,8 @@ static bool yield_stub(struct task_struct *from, struct task_struct *to) { retur
 static void set_weight_stub(struct task_struct *p, u32 weight) {}
 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
 static void update_idle_stub(s32 cpu, bool idle) {}
+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
 static void enable_stub(struct task_struct *p) {}
@@ -4338,6 +4473,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.set_weight		= set_weight_stub,
 	.set_cpumask		= set_cpumask_stub,
 	.update_idle		= update_idle_stub,
+	.cpu_acquire		= cpu_acquire_stub,
+	.cpu_release		= cpu_release_stub,
 	.init_task		= init_task_stub,
 	.exit_task		= exit_task_stub,
 	.enable			= enable_stub,
@@ -4870,6 +5007,59 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
 
 __bpf_kfunc_start_defs();
 
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+	u32 nr_enqueued, i;
+	struct rq *rq;
+
+	if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+		return 0;
+
+	rq = cpu_rq(smp_processor_id());
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Get the number of tasks on the local DSQ before iterating over it to
+	 * pull off tasks. The enqueue callback below can signal that it wants
+	 * the task to stay on the local DSQ, and we want to prevent the BPF
+	 * scheduler from causing us to loop indefinitely.
+	 */
+	nr_enqueued = rq->scx.local_dsq.nr;
+	for (i = 0; i < nr_enqueued; i++) {
+		struct task_struct *p;
+
+		p = first_local_task(rq);
+		WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) !=
+			     SCX_OPSS_NONE);
+		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
+		WARN_ON_ONCE(p->scx.holding_cpu != -1);
+		dispatch_dequeue(rq, p);
+		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+	}
+
+	return nr_enqueued;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_cpu_release,
+};
+
+__bpf_kfunc_start_defs();
+
 /**
  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
  * @cpu: cpu to kick
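Putting the new kfunc to use, a hypothetical ops.cpu_release() could look like the sketch below: everything sitting on the preempted CPU's local DSQ is handed back to ops.enqueue() (which will see SCX_ENQ_REENQ) so it can be placed where it can actually run. Names are illustrative, the kfunc declaration is assumed to come from the series' BPF headers, and the SEC()/BPF_PROG() boilerplate follows the usual libbpf conventions.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* the kfunc added by this patch; declaration assumed from the BPF headers */
extern u32 scx_bpf_reenqueue_local(void) __ksym;

SEC("struct_ops/reenq_cpu_release")
void BPF_PROG(reenq_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/*
	 * An RT/DL/stop task took @cpu. Flush the local DSQ back through
	 * ops.enqueue() so the waiting tasks can be re-dispatched elsewhere.
	 */
	u32 moved = scx_bpf_reenqueue_local();

	bpf_printk("cpu%d released (reason=%d), re-enqueued %u tasks",
		   cpu, args->reason, moved);
}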
@@ -5379,6 +5569,8 @@ static int __init scx_init(void)
 					     &scx_kfunc_set_enqueue_dispatch)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_dispatch)) ||
+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					     &scx_kfunc_set_cpu_release)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_any)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,