# Looking at Marconi100's GPU power limit data

## Goals
- Have a first look at the `GpuX_power_management_limit` data
  - Fixed 300W cap
- See if the timestamps match those of `GpuX_power_usage`
  - Not needed if the cap is fixed
- Parse the `tres_per_node` to get the number of GPUs

Only runs on G5K (Grid'5000): the notebook reads its input from hard-coded absolute paths on that platform.

In [6]:
import pandas as pd
import numpy as np

## Time constants
# Kept in case other cells still refer to it; the conversion below does not
# need it any more.
NANOSECONDS_ONE_SECOND = 1e9

## Dataset layout
# Number of per-GPU metric directories (Gpu0 ... Gpu3) to load.
NB_GPUS = 4

# Single place to edit when the dataset is mounted somewhere else.
DATA_DIR = "/home/dcarastandossantos/datamove_storage/danilo-carastan-santos/exadata/data/year_month=22-05"


def to_epoch_seconds(timestamps):
    """Convert a datetime-like Series to float seconds since the Unix epoch.

    The result is computed by subtracting the epoch and dividing by one
    second, so it does not depend on the storage resolution of the column
    (ns, us, ms, ...) nor on the platform's default integer width. Missing
    timestamps (NaT) come out as NaN instead of a meaningless number.

    Parameters
    ----------
    timestamps : pandas.Series
        Anything `pd.to_datetime` accepts; may be tz-aware or tz-naive.

    Returns
    -------
    pandas.Series
        float64 seconds since 1970-01-01, same index as the input.
    """
    ts = pd.to_datetime(timestamps)
    # Build the epoch with the same tz-awareness as the data: pandas refuses to
    # subtract a tz-naive value from a tz-aware one (and vice versa).
    epoch = pd.Timestamp(0, unit="s", tz=ts.dt.tz)
    return (ts - epoch) / pd.Timedelta(seconds=1)


def gpu_metric_filepaths(metric_suffix):
    """Return the parquet path of `Gpu<i>_<metric_suffix>` for every GPU index."""
    return [
        f"{DATA_DIR}/plugin=ganglia_pub/metric=Gpu{gpu}_{metric_suffix}/a_0.parquet"
        for gpu in range(NB_GPUS)
    ]


def load_gpu_metric(filepaths):
    """Read one parquet file per GPU and stack them into a single frame.

    A `gpu` column holding the position of the file in `filepaths` is added to
    each part before concatenation; the index of the result is reset.
    """
    parts = [pd.read_parquet(path).assign(gpu=gpu) for gpu, path in enumerate(filepaths)]
    return pd.concat(parts).reset_index(drop=True)


## Load
df_jobs = pd.read_parquet(f"{DATA_DIR}/plugin=job_table/metric=job_info_marconi100/a_0.parquet")

power_filepaths = gpu_metric_filepaths("power_usage")
df_power = load_gpu_metric(power_filepaths)

powerlimit_filepaths = gpu_metric_filepaths("power_management_limit")
df_powerlimit = load_gpu_metric(powerlimit_filepaths)

## Timestamps as seconds since the epoch
df_power["timestamp_seconds"] = to_epoch_seconds(df_power["timestamp"])
df_powerlimit["timestamp_seconds"] = to_epoch_seconds(df_powerlimit["timestamp"])

# The job time columns are overwritten in place with their epoch-second value.
for time_column in ("submit_time", "start_time", "end_time"):
    df_jobs[time_column] = to_epoch_seconds(df_jobs[time_column])

## Numeric dtypes for the power usage frame
df_power['node'] = pd.to_numeric(df_power['node'])
df_power['value'] = pd.to_numeric(df_power['value'])

df_power

Unnamed: 0,timestamp,value,node,gpu,timestamp_seconds
0,2022-05-07 10:21:31+00:00,51.374001,72,0,1.651919e+09
1,2022-05-07 10:21:52+00:00,51.374001,72,0,1.651919e+09
2,2022-05-07 10:22:13+00:00,51.374001,72,0,1.651919e+09
3,2022-05-07 10:22:34+00:00,51.374001,72,0,1.651919e+09
4,2022-05-07 10:22:55+00:00,51.374001,72,0,1.651919e+09
...,...,...,...,...,...
452109085,2022-05-05 16:40:37+00:00,280.036011,700,3,1.651769e+09
452109086,2022-05-05 16:40:58+00:00,195.654007,700,3,1.651769e+09
452109087,2022-05-05 16:41:20+00:00,195.654007,700,3,1.651769e+09
452109088,2022-05-05 16:41:29+00:00,198.216995,700,3,1.651769e+09


In [9]:
# Summary statistics of the power-limit readings over all loaded GPUs.
powerlimit_values = df_powerlimit["value"]
powerlimit_values.describe()

count    452109027.0
mean           300.0
std              0.0
min            300.0
25%            300.0
50%            300.0
75%            300.0
max            300.0
Name: value, dtype: float64

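The limit is 300 W in every sample (min = max = 300, std = 0): the cap is fixed and no GPU is ever set to a lower limit, so the GPUs are effectively uncapped. OK.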

`tres_per_node` takes values of the form `gres:gpu:x`, where `x` appears to be the number of GPUs requested per node; it is a user-provided input.

Source: https://stackoverflow.com/questions/67091056/gpu-allocation-in-slurm-gres-vs-gpus-per-task-and-mpirun-vs-srun

In [13]:
# List each distinct `tres_per_node` string once (the first occurrence is kept,
# so the index shows where each value first appears in the job table).
tres_per_node = df_jobs["tres_per_node"]
tres_per_node.drop_duplicates()

0                    gres:gpu:0
1                    gres:gpu:1
2                    gres:gpu:4
36                         None
84                   gres:gpu:2
1452      gres:gpu:4,gres:gpu:4
11736                gres:gpu:3
12549                gres:gpu:8
47124               gres:gpu:12
47267                gres:gpu:6
87128                gres:sysfs
128708           gres:gpu:gpu:4
158014                 gres:gpu
Name: tres_per_node, dtype: object

In [66]:
import re  # not used in this cell; left in place in case other cells rely on it

# Only requests of the plain form "gres:gpu:<n>" with n in 0..4 are kept.
# Every other string seen in the data (e.g. "gres:gpu:8", "gres:sysfs",
# "gres:gpu:4,gres:gpu:4", "gres:gpu:gpu:4", "gres:gpu") and missing values
# are dropped by this filter.
allowed_tres_values = ["gres:gpu:0", "gres:gpu:1", "gres:gpu:2", "gres:gpu:3", "gres:gpu:4"]

has_allowed_tres = df_jobs["tres_per_node"].isin(allowed_tres_values)

# Copy *after* filtering: the copy is then only as large as the kept rows, and
# the new column below is added to an independent frame rather than to the
# result of a boolean selection (which is what triggers SettingWithCopyWarning).
df_jobs_cpy = df_jobs[has_allowed_tres].copy()

# Strip the literal prefix (regex=False: plain substring, independent of the
# pandas version's default) and keep the remaining digit as an integer.
df_jobs_cpy["gpus_per_node"] = (
    df_jobs_cpy["tres_per_node"]
    .str.replace("gres:gpu:", "", regex=False)
    .astype("int32")
)
df_jobs_cpy["gpus_per_node"]

0         0
1         1
2         4
3         0
4         4
         ..
239937    1
239938    1
239939    1
239940    2
239941    1
Name: gpus_per_node, Length: 226520, dtype: int32