@@ -186,15 +186,39 @@ def sdiag_structured(self) -> Sdiag:
186186 sdiag_output = json .loads (
187187 subprocess .check_output (["sdiag" , "--all" , "--json" ], text = True )
188188 )
189-
190- return Sdiag (
191- server_thread_count = sdiag_output ["statistics" ]["server_thread_count" ],
192- agent_queue_size = sdiag_output ["statistics" ]["agent_queue_size" ],
193- agent_count = sdiag_output ["statistics" ]["agent_count" ],
194- agent_thread_count = sdiag_output ["statistics" ]["agent_thread_count" ],
195- dbd_agent_queue_size = sdiag_output ["statistics" ]["dbd_agent_queue_size" ],
189+ stats = sdiag_output ["statistics" ]
190+
191+ result = Sdiag (
192+ server_thread_count = stats .get ("server_thread_count" ),
193+ agent_queue_size = stats .get ("agent_queue_size" ),
194+ agent_count = stats .get ("agent_count" ),
195+ agent_thread_count = stats .get ("agent_thread_count" ),
196+ dbd_agent_queue_size = stats .get ("dbd_agent_queue_size" ),
197+ schedule_cycle_max = stats .get ("schedule_cycle_max" ),
198+ schedule_cycle_mean = stats .get ("schedule_cycle_mean" ),
199+ schedule_cycle_sum = stats .get ("schedule_cycle_sum" ),
200+ schedule_cycle_total = stats .get ("schedule_cycle_total" ),
201+ schedule_cycle_per_minute = stats .get ("schedule_cycle_per_minute" ),
202+ schedule_queue_length = stats .get ("schedule_queue_length" ),
203+ sdiag_jobs_submitted = stats .get ("jobs_submitted" ),
204+ sdiag_jobs_started = stats .get ("jobs_started" ),
205+ sdiag_jobs_completed = stats .get ("jobs_completed" ),
206+ sdiag_jobs_canceled = stats .get ("jobs_canceled" ),
207+ sdiag_jobs_failed = stats .get ("jobs_failed" ),
208+ sdiag_jobs_pending = stats .get ("jobs_pending" ),
209+ sdiag_jobs_running = stats .get ("jobs_running" ),
210+ bf_backfilled_jobs = stats .get ("bf_backfilled_jobs" ),
211+ bf_cycle_mean = stats .get ("bf_cycle_mean" ),
212+ bf_cycle_sum = stats .get ("bf_cycle_sum" ),
213+ bf_cycle_max = stats .get ("bf_cycle_max" ),
214+ bf_queue_len = stats .get ("bf_queue_len" ),
196215 )
197216
217+ # Reset sdiag counters after collection
218+ self ._reset_sdiag_counters ()
219+
220+ return result
221+
198222 sdiag_output = subprocess .check_output (["sdiag" , "--all" ], text = True )
199223 metric_names = {
200224 "Server thread count:" : "server_thread_count" ,
@@ -203,7 +227,7 @@ def sdiag_structured(self) -> Sdiag:
203227 "Agent thread count:" : "agent_thread_count" ,
204228 "DBD Agent queue size:" : "dbd_agent_queue_size" ,
205229 }
206- data = {
230+ data : dict [ str , Optional [ int ]] = {
207231 "server_thread_count" : 0 ,
208232 "agent_queue_size" : 0 ,
209233 "agent_count" : 0 ,
@@ -215,8 +239,56 @@ def sdiag_structured(self) -> Sdiag:
215239 lines = re .search (rf".*{ sdiag_name } .*" , sdiag_output )
216240 assert lines is not None , f"Sdiag metric { sdiag_name } not found: { lines } "
217241 data [name ] = int (lines .group ().strip (f"{ sdiag_name } " ))
242+
243+ optional_metric_names = {
244+ "Schedule cycle max:" : "schedule_cycle_max" ,
245+ "Schedule cycle mean:" : "schedule_cycle_mean" ,
246+ "Schedule cycle sum:" : "schedule_cycle_sum" ,
247+ "Schedule cycle total:" : "schedule_cycle_total" ,
248+ "Schedule cycle per minute:" : "schedule_cycle_per_minute" ,
249+ "Schedule queue length:" : "schedule_queue_length" ,
250+ "Jobs submitted:" : "sdiag_jobs_submitted" ,
251+ "Jobs started:" : "sdiag_jobs_started" ,
252+ "Jobs completed:" : "sdiag_jobs_completed" ,
253+ "Jobs canceled:" : "sdiag_jobs_canceled" ,
254+ "Jobs failed:" : "sdiag_jobs_failed" ,
255+ "Jobs pending:" : "sdiag_jobs_pending" ,
256+ "Jobs running:" : "sdiag_jobs_running" ,
257+ "Total backfilled jobs \\ (since last slurm start\\ ):" : "bf_backfilled_jobs" ,
258+ "Backfill cycle mean:" : "bf_cycle_mean" ,
259+ "Backfill cycle sum:" : "bf_cycle_sum" ,
260+ "Backfill cycle max:" : "bf_cycle_max" ,
261+ "Backfill queue length:" : "bf_queue_len" ,
262+ }
263+
264+ for sdiag_name , name in optional_metric_names .items ():
265+ match = re .search (rf"{ sdiag_name } \s*(\d+)" , sdiag_output )
266+ if match :
267+ data [name ] = int (match .group (1 ))
268+ else :
269+ data [name ] = None
270+
271+ # Reset sdiag counters after collection
272+ self ._reset_sdiag_counters ()
273+
218274 return Sdiag (** data )
219275
276+ def _reset_sdiag_counters (self ) -> None :
277+ """Reset sdiag counters after collection.
278+
279+ This requires appropriate permissions (typically root or SlurmUser).
280+ If the reset fails due to permission issues, a warning is logged.
281+ """
282+ try :
283+ subprocess .run (
284+ ["sdiag" , "--reset" ],
285+ check = True ,
286+ capture_output = True ,
287+ text = True ,
288+ )
289+ except subprocess .CalledProcessError as e :
290+ logger .warning (f"Failed to reset sdiag counters: { e .stderr .strip ()} " )
291+
220292 def sinfo_structured (self ) -> Sinfo :
221293 fieldnames = [f .name for f in fields (SinfoRow )]
222294
0 commit comments