Commit 60c27fb
sched_ext: Implement sched_ext_ops.cpu_online/offline()
Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline().

If the BPF scheduler doesn't implement ops.cpu_online/offline(), the scheduler is automatically exited with SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG. Userspace can implement CPU hotplug support trivially by simply reinitializing and reloading the scheduler.

scx_qmap is updated to print out online CPUs on hotplug events. Other schedulers are updated to restart based on ecode.

v3: - The previous implementation added @reason to sched_class.rq_on/offline() to
      distinguish between CPU hotplug events and topology updates. This was buggy
      and fragile as the methods are skipped if the current state equals the
      target state. Instead, add scx_rq_[de]activate() which are directly called
      from sched_cpu_de/activate(). This also allows ops.cpu_on/offline() to
      sleep, which can be useful.

    - ops.dispatch() could be called on a CPU that the BPF scheduler was told to
      be offline. The dispatch path is updated to bypass in such cases.

v2: - To accommodate the lock ordering change between scx_cgroup_rwsem and
      cpus_read_lock(), CPU hotplug operations are put into their own SCX_OPI
      block and enabled earlier during scx_ops_enable() so that cpus_read_lock()
      can be dropped before acquiring scx_cgroup_rwsem.

    - Auto exit with ECODE added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
1 parent 245254f commit 60c27fb
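
The new exit-code layout (documented in the kernel/sched/ext.c hunk below) packs a system-defined action in bits 63..48, a system-defined reason in bits 47..32, and user-defined bits in 31..0. A minimal sketch of how user space might decode it; the two SCX_ECODE_* values come from this patch, while ecode_wants_restart() and the USR mask are illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCX_ECODE_RSN_HOTPLUG	(1LLU << 32)		/* system-defined reason (from this patch) */
#define SCX_ECODE_ACT_RESTART	(1LLU << 48)		/* system-defined action (from this patch) */
#define SCX_ECODE_USR_MASK	((1LLU << 32) - 1)	/* illustrative: the low 32 user-defined bits */

/* Hypothetical helper: decide whether an ops.exit() exit code asks for a reload. */
static bool ecode_wants_restart(uint64_t ecode)
{
	if (ecode & SCX_ECODE_RSN_HOTPLUG)
		fprintf(stderr, "scheduler exited due to CPU hotplug (user bits 0x%llx)\n",
			(unsigned long long)(ecode & SCX_ECODE_USR_MASK));

	return ecode & SCX_ECODE_ACT_RESTART;
}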

10 files changed: +290, −12 lines

kernel/sched/core.c

Lines changed: 4 additions & 0 deletions
@@ -7984,6 +7984,8 @@ int sched_cpu_activate(unsigned int cpu)
 		cpuset_cpu_active();
 	}
 
+	scx_rq_activate(rq);
+
 	/*
 	 * Put the rq online, if not already. This happens:
 	 *
@@ -8044,6 +8046,8 @@ int sched_cpu_deactivate(unsigned int cpu)
 	}
 	rq_unlock_irqrestore(rq, &rf);
 
+	scx_rq_deactivate(rq);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.

kernel/sched/ext.c

Lines changed: 149 additions & 7 deletions
@@ -30,6 +30,29 @@ enum scx_exit_kind {
 	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
 };
 
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or
+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
+ * respectively. The codes are 64bit of the format:
+ *
+ *   Bits: [63  ..  48 47   ..  32 31 .. 0]
+ *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
+ *
+ *   SYS ACT: System-defined exit actions
+ *   SYS RSN: System-defined exit reasons
+ *   USR    : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+	/* Reasons */
+	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,
+
+	/* Actions */
+	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
+};
+
 /*
  * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
  * being disabled.
@@ -457,7 +480,29 @@ struct sched_ext_ops {
 	void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
 
 	/*
-	 * All online ops must come before ops.init().
+	 * All online ops must come before ops.cpu_online().
+	 */
+
+	/**
+	 * cpu_online - A CPU became online
+	 * @cpu: CPU which just came up
+	 *
+	 * @cpu just came online. @cpu will not call ops.enqueue() or
+	 * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+	 */
+	void (*cpu_online)(s32 cpu);
+
+	/**
+	 * cpu_offline - A CPU is going offline
+	 * @cpu: CPU which is going offline
+	 *
+	 * @cpu is going offline. @cpu will not call ops.enqueue() or
+	 * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+	 */
+	void (*cpu_offline)(s32 cpu);
+
+	/*
+	 * All CPU hotplug ops must come before ops.init().
 	 */
 
 	/**
@@ -496,6 +541,15 @@ struct sched_ext_ops {
 	 */
 	u32 exit_dump_len;
 
+	/**
+	 * hotplug_seq - A sequence number that may be set by the scheduler to
+	 * detect when a hotplug event has occurred during the loading process.
+	 * If 0, no detection occurs. Otherwise, the scheduler will fail to
+	 * load if the sequence number does not match @scx_hotplug_seq on the
+	 * enable path.
+	 */
+	u64 hotplug_seq;
+
 	/**
 	 * name - BPF scheduler's name
 	 *
@@ -509,7 +563,9 @@ struct sched_ext_ops {
 enum scx_opi {
 	SCX_OPI_BEGIN			= 0,
 	SCX_OPI_NORMAL_BEGIN		= 0,
-	SCX_OPI_NORMAL_END		= SCX_OP_IDX(init),
+	SCX_OPI_NORMAL_END		= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_BEGIN	= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_END		= SCX_OP_IDX(init),
 	SCX_OPI_END			= SCX_OP_IDX(init),
 };
 
@@ -694,6 +750,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
 static struct scx_exit_info *scx_exit_info;
 
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
 /*
  * The maximum amount of time in jiffies that a task may be runnable without
@@ -1419,11 +1476,7 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
 
 static bool scx_rq_online(struct rq *rq)
 {
-#ifdef CONFIG_SMP
-	return likely(rq->online);
-#else
-	return true;
-#endif
+	return likely(rq->scx.flags & SCX_RQ_ONLINE);
 }
 
 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
@@ -1438,6 +1491,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (sticky_cpu == cpu_of(rq))
 		goto local_norefill;
 
+	/*
+	 * If !scx_rq_online(), we already told the BPF scheduler that the CPU
+	 * is offline and are just running the hotplug path. Don't bother the
+	 * BPF scheduler.
+	 */
 	if (!scx_rq_online(rq))
 		goto local;
 
@@ -2673,6 +2731,42 @@ void __scx_update_idle(struct rq *rq, bool idle)
 #endif
 }
 
+static void handle_hotplug(struct rq *rq, bool online)
+{
+	int cpu = cpu_of(rq);
+
+	atomic_long_inc(&scx_hotplug_seq);
+
+	if (online && SCX_HAS_OP(cpu_online))
+		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu);
+	else if (!online && SCX_HAS_OP(cpu_offline))
+		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu);
+	else
+		scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+			     "cpu %d going %s, exiting scheduler", cpu,
+			     online ? "online" : "offline");
+}
+
+void scx_rq_activate(struct rq *rq)
+{
+	handle_hotplug(rq, true);
+}
+
+void scx_rq_deactivate(struct rq *rq)
+{
+	handle_hotplug(rq, false);
+}
+
+static void rq_online_scx(struct rq *rq)
+{
+	rq->scx.flags |= SCX_RQ_ONLINE;
+}
+
+static void rq_offline_scx(struct rq *rq)
+{
+	rq->scx.flags &= ~SCX_RQ_ONLINE;
+}
+
 #else /* CONFIG_SMP */
 
 static bool test_and_clear_cpu_idle(int cpu) { return false; }
@@ -3104,6 +3198,9 @@ DEFINE_SCHED_CLASS(ext) = {
 	.balance		= balance_scx,
 	.select_task_rq		= select_task_rq_scx,
 	.set_cpus_allowed	= set_cpus_allowed_scx,
+
+	.rq_online		= rq_online_scx,
+	.rq_offline		= rq_offline_scx,
 #endif
 
 	.task_tick		= task_tick_scx,
@@ -3235,10 +3332,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
 }
 SCX_ATTR(nr_rejected);
 
+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
+					 struct kobj_attribute *ka, char *buf)
+{
+	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
+}
+SCX_ATTR(hotplug_seq);
+
 static struct attribute *scx_global_attrs[] = {
 	&scx_attr_state.attr,
 	&scx_attr_switch_all.attr,
 	&scx_attr_nr_rejected.attr,
+	&scx_attr_hotplug_seq.attr,
 	NULL,
 };
 
@@ -3941,6 +4046,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
 	return helper;
 }
 
+static void check_hotplug_seq(const struct sched_ext_ops *ops)
+{
+	unsigned long long global_hotplug_seq;
+
+	/*
+	 * If a hotplug event has occurred between when a scheduler was
+	 * initialized, and when we were able to attach, exit and notify user
+	 * space about it.
+	 */
+	if (ops->hotplug_seq) {
+		global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
+		if (ops->hotplug_seq != global_hotplug_seq) {
+			scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+				     "expected hotplug seq %llu did not match actual %llu",
+				     ops->hotplug_seq, global_hotplug_seq);
+		}
+	}
+}
+
 static int validate_ops(const struct sched_ext_ops *ops)
 {
 	/*
@@ -4023,6 +4147,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		}
 	}
 
+	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+		if (((void (**)(void))ops)[i])
+			static_branch_enable_cpuslocked(&scx_has_op[i]);
+
 	cpus_read_unlock();
 
 	ret = validate_ops(ops);
@@ -4064,6 +4192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	percpu_down_write(&scx_fork_rwsem);
 	cpus_read_lock();
 
+	check_hotplug_seq(ops);
+
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -4374,6 +4504,9 @@ static int bpf_scx_init_member(const struct btf_type *t,
 		ops->exit_dump_len =
 			*(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
 		return 1;
+	case offsetof(struct sched_ext_ops, hotplug_seq):
+		ops->hotplug_seq = *(u64 *)(udata + moff);
+		return 1;
 	}
 
 	return 0;
@@ -4387,6 +4520,8 @@ static int bpf_scx_check_member(const struct btf_type *t,
 
 	switch (moff) {
 	case offsetof(struct sched_ext_ops, init_task):
+	case offsetof(struct sched_ext_ops, cpu_online):
+	case offsetof(struct sched_ext_ops, cpu_offline):
 	case offsetof(struct sched_ext_ops, init):
 	case offsetof(struct sched_ext_ops, exit):
 		break;
@@ -4457,6 +4592,8 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args
 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
 static void enable_stub(struct task_struct *p) {}
 static void disable_stub(struct task_struct *p) {}
+static void cpu_online_stub(s32 cpu) {}
+static void cpu_offline_stub(s32 cpu) {}
 static s32 init_stub(void) { return -EINVAL; }
 static void exit_stub(struct scx_exit_info *info) {}
 
@@ -4479,6 +4616,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.exit_task = exit_task_stub,
 	.enable = enable_stub,
 	.disable = disable_stub,
+	.cpu_online = cpu_online_stub,
+	.cpu_offline = cpu_offline_stub,
 	.init = init_stub,
 	.exit = exit_stub,
 };
@@ -4719,6 +4858,9 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
 		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+
+		if (cpu_online(cpu))
+			cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
 	}
 
 	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
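
On the BPF side, opting into hotplug handling only requires providing the two new callbacks. A hedged sketch of what that can look like, assuming the usual tools/sched_ext common.bpf.h helpers; the scheduler name, the cpu_is_online array, and the printk messages are illustrative and not part of this commit (scx_qmap, updated separately in this series, prints the online CPUs instead):

/* SPDX-License-Identifier: GPL-2.0 */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define MAX_CPUS 512				/* illustrative bound */

/* Updated only from the hotplug callbacks; other ops may consult it. */
static bool cpu_is_online[MAX_CPUS];

void BPF_STRUCT_OPS(hotplug_demo_cpu_online, s32 cpu)
{
	if (cpu >= 0 && cpu < MAX_CPUS)
		cpu_is_online[cpu] = true;
	bpf_printk("cpu %d came online", cpu);
}

void BPF_STRUCT_OPS(hotplug_demo_cpu_offline, s32 cpu)
{
	if (cpu >= 0 && cpu < MAX_CPUS)
		cpu_is_online[cpu] = false;
	bpf_printk("cpu %d went offline", cpu);
}

SCX_OPS_DEFINE(hotplug_demo_ops,
	       .cpu_online	= (void *)hotplug_demo_cpu_online,
	       .cpu_offline	= (void *)hotplug_demo_cpu_offline,
	       .name		= "hotplug_demo");

Since this patch invokes the callbacks from a sleepable context, a scheduler that wants to call sleepable kfuncs from them would register them as sleepable struct_ops programs instead.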

kernel/sched/ext.h

Lines changed: 4 additions & 0 deletions
@@ -40,6 +40,8 @@ int scx_fork(struct task_struct *p);
 void scx_post_fork(struct task_struct *p);
 void scx_cancel_fork(struct task_struct *p);
 bool scx_can_stop_tick(struct rq *rq);
+void scx_rq_activate(struct rq *rq);
+void scx_rq_deactivate(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
 bool task_should_scx(struct task_struct *p);
 void init_sched_ext_class(void);
@@ -81,6 +83,8 @@ static inline int scx_fork(struct task_struct *p) { return 0; }
 static inline void scx_post_fork(struct task_struct *p) {}
 static inline void scx_cancel_fork(struct task_struct *p) {}
 static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
+static inline void scx_rq_activate(struct rq *rq) {}
+static inline void scx_rq_deactivate(struct rq *rq) {}
 static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
 static inline bool task_on_scx(const struct task_struct *p) { return false; }
 static inline void init_sched_ext_class(void) {}

kernel/sched/sched.h

Lines changed: 6 additions & 0 deletions
@@ -726,6 +726,12 @@ struct cfs_rq {
 #ifdef CONFIG_SCHED_CLASS_EXT
 /* scx_rq->flags, protected by the rq lock */
 enum scx_rq_flags {
+	/*
+	 * A hotplugged CPU starts scheduling before rq_online_scx(). Track
+	 * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
+	 * only while the BPF scheduler considers the CPU to be online.
+	 */
+	SCX_RQ_ONLINE		= 1 << 0,
 	SCX_RQ_BALANCING	= 1 << 1,
 	SCX_RQ_CAN_STOP_TICK	= 1 << 2,
 };

tools/sched_ext/include/scx/compat.h

Lines changed: 26 additions & 0 deletions
@@ -8,6 +8,9 @@
 #define __SCX_COMPAT_H
 
 #include <bpf/btf.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
 
 struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
 
@@ -106,6 +109,28 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 #define SCX_OPS_SWITCH_PARTIAL \
 	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
 
+static inline long scx_hotplug_seq(void)
+{
+	int fd;
+	char buf[32];
+	ssize_t len;
+	long val;
+
+	fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
+	if (fd < 0)
+		return -ENOENT;
+
+	len = read(fd, buf, sizeof(buf) - 1);
+	SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
+	buf[len] = 0;
+	close(fd);
+
+	val = strtoul(buf, NULL, 10);
+	SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
+
+	return val;
+}
+
 /*
  * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
  * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
@@ -123,6 +148,7 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 									\
 	__skel = __scx_name##__open();					\
 	SCX_BUG_ON(!__skel, "Could not open " #__scx_name);		\
+	__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq();\
 	__skel;								\
 })
 
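Tying the pieces together, a loader built on the compat helpers above can treat any restart-action exit the same way, whether it came from a hotplug event on a scheduler without ops.cpu_online/offline() or from check_hotplug_seq() catching a hotplug race during load: tear down and go around again. This is a hedged sketch; run_scheduler_once() is a hypothetical stand-in for the scheduler's usual SCX_OPS_OPEN/load/attach/wait/teardown sequence, not a real helper:

#include <stdint.h>

#define SCX_ECODE_ACT_RESTART	(1LLU << 48)

/*
 * Hypothetical stand-in: open the skeleton with SCX_OPS_OPEN() (which now
 * samples hotplug_seq), load, attach, wait for ops.exit(), destroy, and
 * return the 64-bit exit code the scheduler reported.
 */
uint64_t run_scheduler_once(void);

int main(void)
{
	uint64_t ecode;

	/*
	 * A hotplug event between open and attach, or on a scheduler without
	 * ops.cpu_online/offline(), comes back as an ACT_RESTART exit;
	 * reinitializing and reloading is all the hotplug support needed.
	 */
	do {
		ecode = run_scheduler_once();
	} while (ecode & SCX_ECODE_ACT_RESTART);

	return 0;
}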