@@ -926,6 +926,11 @@ static u32 highest_bit(u32 flags)
926926 return ((u64 )1 << bit ) >> 1 ;
927927}
928928
929+ static bool u32_before (u32 a , u32 b )
930+ {
931+ return (s32 )(a - b ) < 0 ;
932+ }
933+
929934/*
930935 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
931936 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -1066,6 +1071,73 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
10661071 return true;
10671072}
10681073
1074+ /**
1075+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
1076+ * @dsq: user dsq being interated
1077+ * @cur: current position, %NULL to start iteration
1078+ * @rev: walk backwards
1079+ *
1080+ * Returns %NULL when iteration is finished.
1081+ */
1082+ static struct task_struct * nldsq_next_task (struct scx_dispatch_q * dsq ,
1083+ struct task_struct * cur , bool rev )
1084+ {
1085+ struct list_head * list_node ;
1086+ struct scx_dsq_list_node * dsq_lnode ;
1087+
1088+ lockdep_assert_held (& dsq -> lock );
1089+
1090+ if (cur )
1091+ list_node = & cur -> scx .dsq_list .node ;
1092+ else
1093+ list_node = & dsq -> list ;
1094+
1095+ /* find the next task, need to skip BPF iteration cursors */
1096+ do {
1097+ if (rev )
1098+ list_node = list_node -> prev ;
1099+ else
1100+ list_node = list_node -> next ;
1101+
1102+ if (list_node == & dsq -> list )
1103+ return NULL ;
1104+
1105+ dsq_lnode = container_of (list_node , struct scx_dsq_list_node ,
1106+ node );
1107+ } while (dsq_lnode -> is_bpf_iter_cursor );
1108+
1109+ return container_of (dsq_lnode , struct task_struct , scx .dsq_list );
1110+ }
1111+
/* walk every task on a non-local DSQ in dispatch order, cursors skipped */
#define nldsq_for_each_task(p, dsq)					\
	for ((p) = nldsq_next_task((dsq), NULL, false); (p);		\
	     (p) = nldsq_next_task((dsq), (p), false))
1115+
1116+
/*
 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
 * dispatch order. BPF-visible iterator is opaque and larger to allow future
 * changes without breaking backward compatibility. Can be used with
 * bpf_for_each(). See bpf_iter_scx_dsq_*().
 */
enum scx_dsq_iter_flags {
	/* iterate in the reverse dispatch order */
	SCX_DSQ_ITER_REV		= 1U << 0,

	__SCX_DSQ_ITER_ALL_FLAGS	= SCX_DSQ_ITER_REV,
};

/*
 * Kernel-side view of the opaque BPF iterator state. Must fit within and
 * match the alignment of struct bpf_iter_scx_dsq; this is verified with
 * BUILD_BUG_ON() in bpf_iter_scx_dsq_new().
 */
struct bpf_iter_scx_dsq_kern {
	/* cursor node linked into @dsq's task list to remember the position */
	struct scx_dsq_list_node	cursor;
	/* DSQ being iterated; NULL before _new() succeeds / after _destroy() */
	struct scx_dispatch_q		*dsq;
	/* @dsq->seq snapshot at _new(); tasks queued later are skipped */
	u32				dsq_seq;
	/* SCX_DSQ_ITER_* */
	u32				flags;
} __attribute__((aligned(8)));

/* BPF-visible opaque iterator, sized with headroom for future extension */
struct bpf_iter_scx_dsq {
	u64			__opaque[6];
} __attribute__((aligned(8)));
1140+
10691141
10701142/*
10711143 * SCX task iterator.
@@ -1415,7 +1487,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
14151487 * tested easily when adding the first task.
14161488 */
14171489 if (unlikely (RB_EMPTY_ROOT (& dsq -> priq ) &&
1418- ! list_empty ( & dsq -> list )))
1490+ nldsq_next_task ( dsq , NULL , false )))
14191491 scx_ops_error ("DSQ ID 0x%016llx already had FIFO-enqueued tasks" ,
14201492 dsq -> id );
14211493
@@ -1447,6 +1519,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
14471519 list_add_tail (& p -> scx .dsq_list .node , & dsq -> list );
14481520 }
14491521
1522+ /* seq records the order tasks are queued, used by BPF DSQ iterator */
1523+ dsq -> seq ++ ;
1524+ p -> scx .dsq_seq = dsq -> seq ;
1525+
14501526 dsq_mod_nr (dsq , 1 );
14511527 p -> scx .dsq = dsq ;
14521528
@@ -2104,12 +2180,17 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
21042180{
21052181 struct task_struct * p ;
21062182retry :
2183+ /*
2184+ * The caller can't expect to successfully consume a task if the task's
2185+ * addition to @dsq isn't guaranteed to be visible somehow. Test
2186+ * @dsq->list without locking and skip if it seems empty.
2187+ */
21072188 if (list_empty (& dsq -> list ))
21082189 return false;
21092190
21102191 raw_spin_lock (& dsq -> lock );
21112192
2112- list_for_each_entry (p , & dsq -> list , scx . dsq_list . node ) {
2193+ nldsq_for_each_task (p , dsq ) {
21132194 struct rq * task_rq = task_rq (p );
21142195
21152196 if (rq == task_rq ) {
@@ -5705,6 +5786,110 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
57055786 destroy_dsq (dsq_id );
57065787}
57075788
5789+ /**
5790+ * bpf_iter_scx_dsq_new - Create a DSQ iterator
5791+ * @it: iterator to initialize
5792+ * @dsq_id: DSQ to iterate
5793+ * @flags: %SCX_DSQ_ITER_*
5794+ *
5795+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
5796+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
5797+ * tasks which are already queued when this function is invoked.
5798+ */
5799+ __bpf_kfunc int bpf_iter_scx_dsq_new (struct bpf_iter_scx_dsq * it , u64 dsq_id ,
5800+ u64 flags )
5801+ {
5802+ struct bpf_iter_scx_dsq_kern * kit = (void * )it ;
5803+
5804+ BUILD_BUG_ON (sizeof (struct bpf_iter_scx_dsq_kern ) >
5805+ sizeof (struct bpf_iter_scx_dsq ));
5806+ BUILD_BUG_ON (__alignof__(struct bpf_iter_scx_dsq_kern ) !=
5807+ __alignof__(struct bpf_iter_scx_dsq ));
5808+
5809+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS )
5810+ return - EINVAL ;
5811+
5812+ kit -> dsq = find_non_local_dsq (dsq_id );
5813+ if (!kit -> dsq )
5814+ return - ENOENT ;
5815+
5816+ INIT_LIST_HEAD (& kit -> cursor .node );
5817+ kit -> cursor .is_bpf_iter_cursor = true;
5818+ kit -> dsq_seq = READ_ONCE (kit -> dsq -> seq );
5819+ kit -> flags = flags ;
5820+
5821+ return 0 ;
5822+ }
5823+
5824+ /**
5825+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
5826+ * @it: iterator to progress
5827+ *
5828+ * Return the next task. See bpf_iter_scx_dsq_new().
5829+ */
5830+ __bpf_kfunc struct task_struct * bpf_iter_scx_dsq_next (struct bpf_iter_scx_dsq * it )
5831+ {
5832+ struct bpf_iter_scx_dsq_kern * kit = (void * )it ;
5833+ bool rev = kit -> flags & SCX_DSQ_ITER_REV ;
5834+ struct task_struct * p ;
5835+ unsigned long flags ;
5836+
5837+ if (!kit -> dsq )
5838+ return NULL ;
5839+
5840+ raw_spin_lock_irqsave (& kit -> dsq -> lock , flags );
5841+
5842+ if (list_empty (& kit -> cursor .node ))
5843+ p = NULL ;
5844+ else
5845+ p = container_of (& kit -> cursor , struct task_struct , scx .dsq_list );
5846+
5847+ /*
5848+ * Only tasks which were queued before the iteration started are
5849+ * visible. This bounds BPF iterations and guarantees that vtime never
5850+ * jumps in the other direction while iterating.
5851+ */
5852+ do {
5853+ p = nldsq_next_task (kit -> dsq , p , rev );
5854+ } while (p && unlikely (u32_before (kit -> dsq_seq , p -> scx .dsq_seq )));
5855+
5856+ if (p ) {
5857+ if (rev )
5858+ list_move_tail (& kit -> cursor .node , & p -> scx .dsq_list .node );
5859+ else
5860+ list_move (& kit -> cursor .node , & p -> scx .dsq_list .node );
5861+ } else {
5862+ list_del_init (& kit -> cursor .node );
5863+ }
5864+
5865+ raw_spin_unlock_irqrestore (& kit -> dsq -> lock , flags );
5866+
5867+ return p ;
5868+ }
5869+
/**
 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
 * @it: iterator to destroy
 *
 * Undo bpf_iter_scx_dsq_new().
 */
__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
{
	struct bpf_iter_scx_dsq_kern *kit = (void *)it;

	/* never successfully initialized or already destroyed */
	if (!kit->dsq)
		return;

	/* cursor may still be linked into the DSQ list; unlink under lock */
	if (!list_empty(&kit->cursor.node)) {
		unsigned long flags;

		raw_spin_lock_irqsave(&kit->dsq->lock, flags);
		list_del_init(&kit->cursor.node);
		raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
	}
	kit->dsq = NULL;	/* mark destroyed */
}
5892+
57085893__bpf_kfunc_end_defs ();
57095894
57105895static s32 __bstr_format (u64 * data_buf , char * line_buf , size_t line_size ,
@@ -6138,6 +6323,9 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
61386323BTF_ID_FLAGS (func , scx_bpf_kick_cpu )
61396324BTF_ID_FLAGS (func , scx_bpf_dsq_nr_queued )
61406325BTF_ID_FLAGS (func , scx_bpf_destroy_dsq )
6326+ BTF_ID_FLAGS (func , bpf_iter_scx_dsq_new , KF_ITER_NEW | KF_RCU_PROTECTED )
6327+ BTF_ID_FLAGS (func , bpf_iter_scx_dsq_next , KF_ITER_NEXT | KF_RET_NULL )
6328+ BTF_ID_FLAGS (func , bpf_iter_scx_dsq_destroy , KF_ITER_DESTROY )
61416329BTF_ID_FLAGS (func , scx_bpf_exit_bstr , KF_TRUSTED_ARGS )
61426330BTF_ID_FLAGS (func , scx_bpf_error_bstr , KF_TRUSTED_ARGS )
61436331BTF_ID_FLAGS (func , scx_bpf_dump_bstr , KF_TRUSTED_ARGS )
0 commit comments