Skip to content

Commit d84e225

Browse files
yonglimetayongl user
andauthored
[GCM] Add backfill and main scheduling sdiag stats to slurm monitor (#39)
* [sdiag] Add sdiag telemetry * Update json for test * [sdiag] Add sdiag telemetry * Update json for test --------- Co-authored-by: yongl@yongl-login-0.yongl-login.tenant-slurm.svc.cluster.local <> Co-authored-by: yongl user <yongl@yongl-login-0.yongl-login.tenant-slurm.svc.cluster.local>
1 parent 1ade0a0 commit d84e225

File tree

4 files changed

+818
-9
lines changed

4 files changed

+818
-9
lines changed

gcm/monitoring/slurm/client.py

Lines changed: 80 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -186,15 +186,39 @@ def sdiag_structured(self) -> Sdiag:
186186
sdiag_output = json.loads(
187187
subprocess.check_output(["sdiag", "--all", "--json"], text=True)
188188
)
189-
190-
return Sdiag(
191-
server_thread_count=sdiag_output["statistics"]["server_thread_count"],
192-
agent_queue_size=sdiag_output["statistics"]["agent_queue_size"],
193-
agent_count=sdiag_output["statistics"]["agent_count"],
194-
agent_thread_count=sdiag_output["statistics"]["agent_thread_count"],
195-
dbd_agent_queue_size=sdiag_output["statistics"]["dbd_agent_queue_size"],
189+
stats = sdiag_output["statistics"]
190+
191+
result = Sdiag(
192+
server_thread_count=stats.get("server_thread_count"),
193+
agent_queue_size=stats.get("agent_queue_size"),
194+
agent_count=stats.get("agent_count"),
195+
agent_thread_count=stats.get("agent_thread_count"),
196+
dbd_agent_queue_size=stats.get("dbd_agent_queue_size"),
197+
schedule_cycle_max=stats.get("schedule_cycle_max"),
198+
schedule_cycle_mean=stats.get("schedule_cycle_mean"),
199+
schedule_cycle_sum=stats.get("schedule_cycle_sum"),
200+
schedule_cycle_total=stats.get("schedule_cycle_total"),
201+
schedule_cycle_per_minute=stats.get("schedule_cycle_per_minute"),
202+
schedule_queue_length=stats.get("schedule_queue_length"),
203+
sdiag_jobs_submitted=stats.get("jobs_submitted"),
204+
sdiag_jobs_started=stats.get("jobs_started"),
205+
sdiag_jobs_completed=stats.get("jobs_completed"),
206+
sdiag_jobs_canceled=stats.get("jobs_canceled"),
207+
sdiag_jobs_failed=stats.get("jobs_failed"),
208+
sdiag_jobs_pending=stats.get("jobs_pending"),
209+
sdiag_jobs_running=stats.get("jobs_running"),
210+
bf_backfilled_jobs=stats.get("bf_backfilled_jobs"),
211+
bf_cycle_mean=stats.get("bf_cycle_mean"),
212+
bf_cycle_sum=stats.get("bf_cycle_sum"),
213+
bf_cycle_max=stats.get("bf_cycle_max"),
214+
bf_queue_len=stats.get("bf_queue_len"),
196215
)
197216

217+
# Reset sdiag counters after collection
218+
self._reset_sdiag_counters()
219+
220+
return result
221+
198222
sdiag_output = subprocess.check_output(["sdiag", "--all"], text=True)
199223
metric_names = {
200224
"Server thread count:": "server_thread_count",
@@ -203,7 +227,7 @@ def sdiag_structured(self) -> Sdiag:
203227
"Agent thread count:": "agent_thread_count",
204228
"DBD Agent queue size:": "dbd_agent_queue_size",
205229
}
206-
data = {
230+
data: dict[str, Optional[int]] = {
207231
"server_thread_count": 0,
208232
"agent_queue_size": 0,
209233
"agent_count": 0,
@@ -215,8 +239,56 @@ def sdiag_structured(self) -> Sdiag:
215239
lines = re.search(rf".*{sdiag_name}.*", sdiag_output)
216240
assert lines is not None, f"Sdiag metric {sdiag_name} not found: {lines}"
217241
data[name] = int(lines.group().strip(f"{sdiag_name}"))
242+
243+
optional_metric_names = {
244+
"Schedule cycle max:": "schedule_cycle_max",
245+
"Schedule cycle mean:": "schedule_cycle_mean",
246+
"Schedule cycle sum:": "schedule_cycle_sum",
247+
"Schedule cycle total:": "schedule_cycle_total",
248+
"Schedule cycle per minute:": "schedule_cycle_per_minute",
249+
"Schedule queue length:": "schedule_queue_length",
250+
"Jobs submitted:": "sdiag_jobs_submitted",
251+
"Jobs started:": "sdiag_jobs_started",
252+
"Jobs completed:": "sdiag_jobs_completed",
253+
"Jobs canceled:": "sdiag_jobs_canceled",
254+
"Jobs failed:": "sdiag_jobs_failed",
255+
"Jobs pending:": "sdiag_jobs_pending",
256+
"Jobs running:": "sdiag_jobs_running",
257+
"Total backfilled jobs \\(since last slurm start\\):": "bf_backfilled_jobs",
258+
"Backfill cycle mean:": "bf_cycle_mean",
259+
"Backfill cycle sum:": "bf_cycle_sum",
260+
"Backfill cycle max:": "bf_cycle_max",
261+
"Backfill queue length:": "bf_queue_len",
262+
}
263+
264+
for sdiag_name, name in optional_metric_names.items():
265+
match = re.search(rf"{sdiag_name}\s*(\d+)", sdiag_output)
266+
if match:
267+
data[name] = int(match.group(1))
268+
else:
269+
data[name] = None
270+
271+
# Reset sdiag counters after collection
272+
self._reset_sdiag_counters()
273+
218274
return Sdiag(**data)
219275

276+
def _reset_sdiag_counters(self) -> None:
277+
"""Reset sdiag counters after collection.
278+
279+
This requires appropriate permissions (typically root or SlurmUser).
280+
If the reset fails due to permission issues, a warning is logged.
281+
"""
282+
try:
283+
subprocess.run(
284+
["sdiag", "--reset"],
285+
check=True,
286+
capture_output=True,
287+
text=True,
288+
)
289+
except subprocess.CalledProcessError as e:
290+
logger.warning(f"Failed to reset sdiag counters: {e.stderr.strip()}")
291+
220292
def sinfo_structured(self) -> Sinfo:
221293
fieldnames = [f.name for f in fields(SinfoRow)]
222294

gcm/schemas/slurm/sdiag.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,27 @@ class Sdiag:
1111
agent_count: Optional[int]
1212
agent_thread_count: Optional[int]
1313
dbd_agent_queue_size: Optional[int]
14+
15+
# Schedule cycle statistics
16+
schedule_cycle_max: Optional[int] = None
17+
schedule_cycle_mean: Optional[int] = None
18+
schedule_cycle_sum: Optional[int] = None
19+
schedule_cycle_total: Optional[int] = None
20+
schedule_cycle_per_minute: Optional[int] = None
21+
schedule_queue_length: Optional[int] = None
22+
23+
# Job statistics (prefixed with sdiag_ to avoid collision with SLURMLog)
24+
sdiag_jobs_submitted: Optional[int] = None
25+
sdiag_jobs_started: Optional[int] = None
26+
sdiag_jobs_completed: Optional[int] = None
27+
sdiag_jobs_canceled: Optional[int] = None
28+
sdiag_jobs_failed: Optional[int] = None
29+
sdiag_jobs_pending: Optional[int] = None
30+
sdiag_jobs_running: Optional[int] = None
31+
32+
# Backfill statistics
33+
bf_backfilled_jobs: Optional[int] = None
34+
bf_cycle_mean: Optional[int] = None
35+
bf_cycle_sum: Optional[int] = None
36+
bf_cycle_max: Optional[int] = None
37+
bf_queue_len: Optional[int] = None

0 commit comments

Comments
 (0)