Skip to content

Commit 90e5516

Browse files
Byte-Lab authored and htejun committed
sched_ext: Implement SCX_KICK_WAIT
If set when calling scx_bpf_kick_cpu(), the invoking CPU will busy wait for the kicked cpu to enter the scheduler. See the following for example usage: https://github.com/sched-ext/scx/blob/main/scheds/c/scx_pair.bpf.c v2: - Updated to fit the updated kick_cpus_irq_workfn() implementation. - Include SCX_KICK_WAIT related information in debug dump. Signed-off-by: David Vernet <dvernet@meta.com> Reviewed-by: Tejun Heo <tj@kernel.org> Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Josh Don <joshdon@google.com> Acked-by: Hao Luo <haoluo@google.com> Acked-by: Barret Rhoden <brho@google.com>
1 parent 3645402 commit 90e5516

File tree

4 files changed

+85
-7
lines changed

4 files changed

+85
-7
lines changed

kernel/sched/core.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5898,8 +5898,10 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
58985898

58995899
for_each_active_class(class) {
59005900
p = class->pick_next_task(rq);
5901-
if (p)
5901+
if (p) {
5902+
scx_next_task_picked(rq, p, class);
59025903
return p;
5904+
}
59035905
}
59045906

59055907
BUG(); /* The idle class should always have a runnable task. */

kernel/sched/ext.c

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,12 @@ enum scx_kick_flags {
532532
* task expires and the dispatch path is invoked.
533533
*/
534534
SCX_KICK_PREEMPT = 1LLU << 1,
535+
536+
/*
537+
* Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
538+
* return after the target CPU finishes picking the next task.
539+
*/
540+
SCX_KICK_WAIT = 1LLU << 2,
535541
};
536542

537543
enum scx_ops_enable_state {
@@ -661,6 +667,9 @@ static struct {
661667

662668
#endif /* CONFIG_SMP */
663669

670+
/* for %SCX_KICK_WAIT */
671+
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
672+
664673
/*
665674
* Direct dispatch marker.
666675
*
@@ -2288,6 +2297,23 @@ static struct task_struct *pick_next_task_scx(struct rq *rq)
22882297
return p;
22892298
}
22902299

2300+
void scx_next_task_picked(struct rq *rq, struct task_struct *p,
2301+
const struct sched_class *active)
2302+
{
2303+
lockdep_assert_rq_held(rq);
2304+
2305+
if (!scx_enabled())
2306+
return;
2307+
#ifdef CONFIG_SMP
2308+
/*
2309+
* Pairs with the smp_load_acquire() issued by a CPU in
2310+
* kick_cpus_irq_workfn() who is waiting for this CPU to perform a
2311+
* resched.
2312+
*/
2313+
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
2314+
#endif
2315+
}
2316+
22912317
#ifdef CONFIG_SMP
22922318

22932319
static bool test_and_clear_cpu_idle(int cpu)
@@ -3673,9 +3699,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
36733699
seq_buf_init(&ns, buf, avail);
36743700

36753701
dump_newline(&ns);
3676-
dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu",
3702+
dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu pnt_seq=%lu",
36773703
cpu, rq->scx.nr_running, rq->scx.flags,
3678-
rq->scx.ops_qseq);
3704+
rq->scx.ops_qseq, rq->scx.pnt_seq);
36793705
dump_line(&ns, " curr=%s[%d] class=%ps",
36803706
rq->curr->comm, rq->curr->pid,
36813707
rq->curr->sched_class);
@@ -3688,6 +3714,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
36883714
if (!cpumask_empty(rq->scx.cpus_to_preempt))
36893715
dump_line(&ns, " cpus_to_preempt: %*pb",
36903716
cpumask_pr_args(rq->scx.cpus_to_preempt));
3717+
if (!cpumask_empty(rq->scx.cpus_to_wait))
3718+
dump_line(&ns, " cpus_to_wait : %*pb",
3719+
cpumask_pr_args(rq->scx.cpus_to_wait));
36913720

36923721
used = seq_buf_used(&ns);
36933722
if (SCX_HAS_OP(dump_cpu)) {
@@ -4383,10 +4412,11 @@ static bool can_skip_idle_kick(struct rq *rq)
43834412
return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING);
43844413
}
43854414

4386-
static void kick_one_cpu(s32 cpu, struct rq *this_rq)
4415+
static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
43874416
{
43884417
struct rq *rq = cpu_rq(cpu);
43894418
struct scx_rq *this_scx = &this_rq->scx;
4419+
bool should_wait = false;
43904420
unsigned long flags;
43914421

43924422
raw_spin_rq_lock_irqsave(rq, flags);
@@ -4402,12 +4432,20 @@ static void kick_one_cpu(s32 cpu, struct rq *this_rq)
44024432
cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
44034433
}
44044434

4435+
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
4436+
pseqs[cpu] = rq->scx.pnt_seq;
4437+
should_wait = true;
4438+
}
4439+
44054440
resched_curr(rq);
44064441
} else {
44074442
cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
4443+
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
44084444
}
44094445

44104446
raw_spin_rq_unlock_irqrestore(rq, flags);
4447+
4448+
return should_wait;
44114449
}
44124450

44134451
static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
@@ -4428,10 +4466,12 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
44284466
{
44294467
struct rq *this_rq = this_rq();
44304468
struct scx_rq *this_scx = &this_rq->scx;
4469+
unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
4470+
bool should_wait = false;
44314471
s32 cpu;
44324472

44334473
for_each_cpu(cpu, this_scx->cpus_to_kick) {
4434-
kick_one_cpu(cpu, this_rq);
4474+
should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
44354475
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
44364476
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
44374477
}
@@ -4440,6 +4480,28 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
44404480
kick_one_cpu_if_idle(cpu, this_rq);
44414481
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
44424482
}
4483+
4484+
if (!should_wait)
4485+
return;
4486+
4487+
for_each_cpu(cpu, this_scx->cpus_to_wait) {
4488+
unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
4489+
4490+
if (cpu != cpu_of(this_rq)) {
4491+
/*
4492+
* Pairs with smp_store_release() issued by this CPU in
4493+
* scx_next_task_picked() on the resched path.
4494+
*
4495+
* We busy-wait here to guarantee that no other task can
4496+
* be scheduled on our core before the target CPU has
4497+
* entered the resched path.
4498+
*/
4499+
while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
4500+
cpu_relax();
4501+
}
4502+
4503+
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
4504+
}
44434505
}
44444506

44454507
/**
@@ -4504,6 +4566,11 @@ void __init init_sched_ext_class(void)
45044566
BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
45054567
BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
45064568
#endif
4569+
scx_kick_cpus_pnt_seqs =
4570+
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
4571+
__alignof__(scx_kick_cpus_pnt_seqs[0]));
4572+
BUG_ON(!scx_kick_cpus_pnt_seqs);
4573+
45074574
for_each_possible_cpu(cpu) {
45084575
struct rq *rq = cpu_rq(cpu);
45094576

@@ -4513,6 +4580,7 @@ void __init init_sched_ext_class(void)
45134580
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
45144581
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
45154582
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
4583+
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
45164584
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
45174585
}
45184586

@@ -4840,8 +4908,8 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
48404908
if (flags & SCX_KICK_IDLE) {
48414909
struct rq *target_rq = cpu_rq(cpu);
48424910

4843-
if (unlikely(flags & SCX_KICK_PREEMPT))
4844-
scx_ops_error("PREEMPT cannot be used with SCX_KICK_IDLE");
4911+
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
4912+
scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
48454913

48464914
if (raw_spin_rq_trylock(target_rq)) {
48474915
if (can_skip_idle_kick(target_rq)) {
@@ -4856,6 +4924,8 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
48564924

48574925
if (flags & SCX_KICK_PREEMPT)
48584926
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
4927+
if (flags & SCX_KICK_WAIT)
4928+
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
48594929
}
48604930

48614931
irq_work_queue(&this_rq->scx.kick_cpus_irq_work);

kernel/sched/ext.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ static inline bool task_on_scx(const struct task_struct *p)
2929
return scx_enabled() && p->sched_class == &ext_sched_class;
3030
}
3131

32+
void scx_next_task_picked(struct rq *rq, struct task_struct *p,
33+
const struct sched_class *active);
3234
void scx_tick(struct rq *rq);
3335
void init_scx_entity(struct sched_ext_entity *scx);
3436
void scx_pre_fork(struct task_struct *p);
@@ -69,6 +71,8 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
6971
#define scx_enabled() false
7072
#define scx_switched_all() false
7173

74+
static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p,
75+
const struct sched_class *active) {}
7276
static inline void scx_tick(struct rq *rq) {}
7377
static inline void scx_pre_fork(struct task_struct *p) {}
7478
static inline int scx_fork(struct task_struct *p) { return 0; }

kernel/sched/sched.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,8 @@ struct scx_rq {
740740
cpumask_var_t cpus_to_kick;
741741
cpumask_var_t cpus_to_kick_if_idle;
742742
cpumask_var_t cpus_to_preempt;
743+
cpumask_var_t cpus_to_wait;
744+
unsigned long pnt_seq;
743745
struct irq_work kick_cpus_irq_work;
744746
};
745747
#endif /* CONFIG_SCHED_CLASS_EXT */

0 commit comments

Comments (0)