Commit 650ba21

sched_ext: Implement DSQ iterator
DSQs are very opaque in the consumption path. The BPF scheduler has no way
of knowing which tasks are being considered and which one is picked. This
patch adds a BPF DSQ iterator.

- Allows iterating tasks queued on a DSQ in dispatch order, or in reverse,
  from anywhere using bpf_for_each(scx_dsq) or by calling the iterator
  kfuncs directly.

- Has an ordering guarantee: only tasks which were already queued when the
  iteration started are visible and consumable during the iteration.

v5: - Add a comment to the naked list_empty(&dsq->list) test in
      consume_dispatch_q() to explain the reasoning behind the lockless test
      and, by extension, why nldsq_next_task() isn't used there.

    - scx_qmap changes separated into their own patch.

v4: - bpf_iter_scx_dsq_new() declaration in common.bpf.h was using the wrong
      type for the last argument (bool rev instead of u64 flags). Fix it.

v3: - Alexei pointed out that the iterator is too big to allocate on stack.
      Added a prep patch to reduce the size of the cursor. Now
      bpf_iter_scx_dsq is 48 bytes and bpf_iter_scx_dsq_kern is 40 bytes on
      64bit.

    - u32_before() comparison factored out.

v2: - scx_bpf_consume_task() is separated out into a separate patch.

    - DSQ seq and iter flags don't need to be u64. Use u32.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: bpf@vger.kernel.org
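For orientation (not part of the patch itself), here is a minimal sketch of how a sched_ext BPF scheduler might walk a DSQ with the new iterator via bpf_for_each(); SHARED_DSQ, the helper name, and the surrounding program boilerplate are hypothetical, and the bpf_for_each() macro is assumed to come from the usual scx/BPF iterator headers:

/*
 * Hypothetical usage sketch: walk the tasks queued on an assumed DSQ
 * (SHARED_DSQ, created elsewhere with scx_bpf_create_dsq()) in dispatch
 * order. Only tasks already queued when the loop starts are visited.
 */
#include <scx/common.bpf.h>

#define SHARED_DSQ	0	/* assumed DSQ id, for illustration only */

static void dump_shared_dsq(void)
{
	struct task_struct *p;

	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0)
		bpf_printk("%s[%d] queued on SHARED_DSQ", p->comm, p->pid);
}

Passing SCX_DSQ_ITER_REV as the last argument would walk the same tasks in reverse dispatch order.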
1 parent d4af01c commit 650ba21

3 files changed: +196, -2 lines changed

include/linux/sched/ext.h

Lines changed: 3 additions & 0 deletions
@@ -61,6 +61,7 @@ struct scx_dispatch_q {
 	struct list_head	list;	/* tasks in dispatch order */
 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
 	u32			nr;
+	u32			seq;	/* used by BPF iter */
 	u64			id;
 	struct rhash_head	hash_node;
 	struct llist_node	free_node;
@@ -123,6 +124,7 @@ enum scx_kf_mask {
 
 struct scx_dsq_list_node {
 	struct list_head	node;
+	bool			is_bpf_iter_cursor;
 };
 
 /*
@@ -133,6 +135,7 @@ struct sched_ext_entity {
 	struct scx_dispatch_q	*dsq;
 	struct scx_dsq_list_node dsq_list;	/* dispatch order */
 	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
+	u32			dsq_seq;
 	u32			dsq_flags;	/* protected by DSQ lock */
 	u32			flags;		/* protected by rq lock */
 	u32			weight;

kernel/sched/ext.c

Lines changed: 190 additions & 2 deletions
@@ -926,6 +926,11 @@ static u32 highest_bit(u32 flags)
 	return ((u64)1 << bit) >> 1;
 }
 
+static bool u32_before(u32 a, u32 b)
+{
+	return (s32)(a - b) < 0;
+}
+
 /*
  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -1066,6 +1071,73 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
 	return true;
 }
 
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being iterated
+ * @cur: current position, %NULL to start iteration
+ * @rev: walk backwards
+ *
+ * Returns %NULL when iteration is finished.
+ */
+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
+					   struct task_struct *cur, bool rev)
+{
+	struct list_head *list_node;
+	struct scx_dsq_list_node *dsq_lnode;
+
+	lockdep_assert_held(&dsq->lock);
+
+	if (cur)
+		list_node = &cur->scx.dsq_list.node;
+	else
+		list_node = &dsq->list;
+
+	/* find the next task, need to skip BPF iteration cursors */
+	do {
+		if (rev)
+			list_node = list_node->prev;
+		else
+			list_node = list_node->next;
+
+		if (list_node == &dsq->list)
+			return NULL;
+
+		dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
+					 node);
+	} while (dsq_lnode->is_bpf_iter_cursor);
+
+	return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
+}
+
+#define nldsq_for_each_task(p, dsq)					\
+	for ((p) = nldsq_next_task((dsq), NULL, false); (p);		\
+	     (p) = nldsq_next_task((dsq), (p), false))
+
+
+/*
+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
+ * dispatch order. BPF-visible iterator is opaque and larger to allow future
+ * changes without breaking backward compatibility. Can be used with
+ * bpf_for_each(). See bpf_iter_scx_dsq_*().
+ */
+enum scx_dsq_iter_flags {
+	/* iterate in the reverse dispatch order */
+	SCX_DSQ_ITER_REV = 1U << 0,
+
+	__SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV,
+};
+
+struct bpf_iter_scx_dsq_kern {
+	struct scx_dsq_list_node	cursor;
+	struct scx_dispatch_q		*dsq;
+	u32				dsq_seq;
+	u32				flags;
+} __attribute__((aligned(8)));
+
+struct bpf_iter_scx_dsq {
+	u64				__opaque[6];
+} __attribute__((aligned(8)));
+
 
 /*
  * SCX task iterator.
@@ -1415,7 +1487,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
 	 * tested easily when adding the first task.
 	 */
 	if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
-		     !list_empty(&dsq->list)))
+		     nldsq_next_task(dsq, NULL, false)))
 		scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
 			      dsq->id);
 
@@ -1447,6 +1519,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
 		list_add_tail(&p->scx.dsq_list.node, &dsq->list);
 	}
 
+	/* seq records the order tasks are queued, used by BPF DSQ iterator */
+	dsq->seq++;
+	p->scx.dsq_seq = dsq->seq;
+
 	dsq_mod_nr(dsq, 1);
 	p->scx.dsq = dsq;
 
@@ -2104,12 +2180,17 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
 {
 	struct task_struct *p;
 retry:
+	/*
+	 * The caller can't expect to successfully consume a task if the task's
+	 * addition to @dsq isn't guaranteed to be visible somehow. Test
+	 * @dsq->list without locking and skip if it seems empty.
+	 */
 	if (list_empty(&dsq->list))
 		return false;
 
 	raw_spin_lock(&dsq->lock);
 
-	list_for_each_entry(p, &dsq->list, scx.dsq_list.node) {
+	nldsq_for_each_task(p, dsq) {
 		struct rq *task_rq = task_rq(p);
 
 		if (rq == task_rq) {
@@ -5705,6 +5786,110 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
 	destroy_dsq(dsq_id);
 }
 
+/**
+ * bpf_iter_scx_dsq_new - Create a DSQ iterator
+ * @it: iterator to initialize
+ * @dsq_id: DSQ to iterate
+ * @flags: %SCX_DSQ_ITER_*
+ *
+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
+ * tasks which are already queued when this function is invoked.
+ */
+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
+				     u64 flags)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
+		     sizeof(struct bpf_iter_scx_dsq));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+		     __alignof__(struct bpf_iter_scx_dsq));
+
+	if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
+		return -EINVAL;
+
+	kit->dsq = find_non_local_dsq(dsq_id);
+	if (!kit->dsq)
+		return -ENOENT;
+
+	INIT_LIST_HEAD(&kit->cursor.node);
+	kit->cursor.is_bpf_iter_cursor = true;
+	kit->dsq_seq = READ_ONCE(kit->dsq->seq);
+	kit->flags = flags;
+
+	return 0;
+}
+
+/**
+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
+ * @it: iterator to progress
+ *
+ * Return the next task. See bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+	bool rev = kit->flags & SCX_DSQ_ITER_REV;
+	struct task_struct *p;
+	unsigned long flags;
+
+	if (!kit->dsq)
+		return NULL;
+
+	raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+
+	if (list_empty(&kit->cursor.node))
+		p = NULL;
+	else
+		p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
+
+	/*
+	 * Only tasks which were queued before the iteration started are
+	 * visible. This bounds BPF iterations and guarantees that vtime never
+	 * jumps in the other direction while iterating.
+	 */
+	do {
+		p = nldsq_next_task(kit->dsq, p, rev);
+	} while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq)));
+
+	if (p) {
+		if (rev)
+			list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
+		else
+			list_move(&kit->cursor.node, &p->scx.dsq_list.node);
+	} else {
+		list_del_init(&kit->cursor.node);
+	}
+
+	raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+
+	return p;
+}
+
+/**
+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
+ * @it: iterator to destroy
+ *
+ * Undo bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+	if (!kit->dsq)
+		return;
+
+	if (!list_empty(&kit->cursor.node)) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+		list_del_init(&kit->cursor.node);
+		raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+	}
+	kit->dsq = NULL;
+}
+
 __bpf_kfunc_end_defs();
 
 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
@@ -6138,6 +6323,9 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
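An aside on the u32_before() helper introduced in the kernel/sched/ext.c changes above: comparing sequence numbers as (s32)(a - b) < 0 stays correct when dsq->seq wraps around, which is what lets the iterator bound itself with dsq_seq. A small illustrative userspace check, not from the patch:

#include <assert.h>
#include <stdint.h>

/* same comparison as the kernel's u32_before() */
static int u32_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	assert(u32_before(1, 2));		/* ordinary ordering */
	assert(!u32_before(2, 1));
	assert(u32_before(UINT32_MAX, 0));	/* still "before" across the wrap */
	assert(!u32_before(0, UINT32_MAX));
	return 0;
}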

tools/sched_ext/include/scx/common.bpf.h

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,9 @@ u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
 void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
 void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
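With these declarations, a BPF program can also drive the iterator kfuncs directly instead of going through bpf_for_each(). A hedged sketch (the function name and the counting logic are illustrative; the iterator must live on the BPF stack and be destroyed on every path, and bpf_iter_scx_dsq_new() has to be called from a context the verifier accepts, e.g. under RCU protection):

static s32 count_queued(u64 dsq_id)
{
	struct bpf_iter_scx_dsq it;
	struct task_struct *p;
	s32 n = 0;

	if (bpf_iter_scx_dsq_new(&it, dsq_id, 0))
		return -1;

	while ((p = bpf_iter_scx_dsq_next(&it)))
		n++;

	bpf_iter_scx_dsq_destroy(&it);
	return n;
}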
