Skip to content

Commit

Permalink
Fix GPU Dashboard (#8572)
Browse files Browse the repository at this point in the history
  • Loading branch information
quasiben committed Mar 28, 2024
1 parent e434793 commit e7f756b
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
8 changes: 3 additions & 5 deletions distributed/dashboard/components/nvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,11 @@ def update(self):

         for idx, ws in enumerate(workers):
             try:
-                info = ws.extra["gpu"]
+                mem_used = ws.metrics["gpu_memory_used"]
+                mem_total = ws.metrics["gpu-memory-total"]
+                u = ws.metrics["gpu_utilization"]
             except KeyError:
                 continue
-            metrics = ws.metrics["gpu"]
-            u = metrics["utilization"]
-            mem_used = metrics["memory-used"]
-            mem_total = info["memory-total"]
             memory_max = max(memory_max, mem_total)
             memory_total += mem_total
             utilization.append(int(u))
Expand Down
2 changes: 2 additions & 0 deletions distributed/system_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def __init__(
             gpu_extra = nvml.one_time()
             self.gpu_name = gpu_extra["name"]
             self.gpu_memory_total = gpu_extra["memory-total"]
+            self.quantities["gpu-memory-total"] = deque(maxlen=1)
             self.quantities["gpu_utilization"] = deque(maxlen=maxlen)
             self.quantities["gpu_memory_used"] = deque(maxlen=maxlen)
         else:
Expand Down Expand Up @@ -207,6 +208,7 @@ def update(self) -> dict[str, Any]:

         if self.gpu_name:
             gpu_metrics = nvml.real_time()
+            result["gpu-memory-total"] = self.gpu_memory_total
             result["gpu_utilization"] = gpu_metrics["utilization"]
             result["gpu_memory_used"] = gpu_metrics["memory-used"]

Expand Down

0 comments on commit e7f756b

Please sign in to comment.