Skip to content

Commit

Permalink
gpu_sysman: use OTEL style metric names + add .unit types
Browse files Browse the repository at this point in the history
Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
  • Loading branch information
eero-t committed May 15, 2024
1 parent c61ad28 commit deaf663
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 118 deletions.
123 changes: 76 additions & 47 deletions src/gpu_sysman.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
* - https://spec.oneapi.com/level-zero/latest/sysman/PROG.html
* - https://spec.oneapi.io/level-zero/latest/sysman/api.html
*
* Metric names and labels are based on the OpenTelemetry spec:
* https://opentelemetry.io/docs/specs/semconv/system/hardware-metrics/#hwgpu---gpu-metrics
*
* Error handling:
* - Allocations are done using collectd scalloc(), smalloc() and sstrdup()
* helpers which log an error and exit on allocation failures
Expand Down Expand Up @@ -64,7 +67,7 @@
#include "utils/common/common.h"

#define PLUGIN_NAME "gpu_sysman"
#define METRIC_PREFIX "collectd_" PLUGIN_NAME "_"
#define METRIC_PREFIX "hw.gpu."

/* collectd plugin API callback finished OK */
#define RET_OK 0
Expand Down Expand Up @@ -1011,12 +1014,13 @@ static void ras_submit(gpu_device_t *gpu, const char *name, const char *help,
*/
.name = (char *)name,
.help = (char *)help,
.unit = "{error}",
};
metric_t m = {0};

m.value.counter = value;
if (type) {
metric_label_set(&m, "type", type);
metric_label_set(&m, "hw.error.type", type);
}
if (subdev) {
metric_label_set(&m, "sub_dev", subdev);
Expand Down Expand Up @@ -1101,54 +1105,54 @@ static bool gpu_ras(gpu_device_t *gpu) {
// https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
case ZES_RAS_ERROR_CAT_RESET:
help = "Total count of HW accelerator resets attempted by the driver";
catname = METRIC_PREFIX "resets";
catname = METRIC_PREFIX "errors.resets";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
help =
"Total count of (non-correctable) HW exceptions generated by the "
"way workloads program the HW";
catname = METRIC_PREFIX "programming_errors";
catname = METRIC_PREFIX "errors.programming";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
help =
"total count of (non-correctable) low-level driver communication "
"errors";
catname = METRIC_PREFIX "driver_errors";
catname = METRIC_PREFIX "errors.driver";
correctable = false;
break;
// categories which can have both correctable and uncorrectable errors
case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
help = "Total count of errors that have occurred in the (shader) "
"accelerator HW";
catname = METRIC_PREFIX "compute_errors";
catname = METRIC_PREFIX "errors.compute";
break;
case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
help = "Total count of errors that have occurred in the fixed-function "
"accelerator HW";
catname = METRIC_PREFIX "fixed_function_errors";
catname = METRIC_PREFIX "errors.fixed_function";
break;
case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
help = "Total count of ECC errors that have occurred in the on-chip "
"caches";
catname = METRIC_PREFIX "cache_errors";
catname = METRIC_PREFIX "errors.cache";
break;
case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
help = "Total count of ECC errors that have occurred in the display";
catname = METRIC_PREFIX "display_errors";
catname = METRIC_PREFIX "errors.display";
break;
default:
help = "Total count of errors in unsupported categories";
catname = METRIC_PREFIX "unknown_errors";
catname = METRIC_PREFIX "errors.unknown";
}
if (correctable) {
ras_submit(gpu, catname, help, type, subdev, value);
} else if (props.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
ras_submit(gpu, catname, help, NULL, subdev, value);
}
}
catname = METRIC_PREFIX "all_errors";
catname = METRIC_PREFIX "errors.all";
help = "Total count of errors in all categories";
ras_submit(gpu, catname, help, type, subdev, total);
ok = true;
Expand Down Expand Up @@ -1216,13 +1220,15 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {

metric_family_t fam_bytes = {
.help = "Sampled memory usage (in bytes)",
.name = METRIC_PREFIX "memory_used_bytes",
.name = METRIC_PREFIX "memory.usage",
.type = METRIC_TYPE_UP_DOWN,
.unit = "By",
};
metric_family_t fam_ratio = {
.help = "Sampled memory usage ratio (0-1)",
.name = METRIC_PREFIX "memory_usage_ratio",
.name = METRIC_PREFIX "memory.utilization",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_t metric = {0};

Expand Down Expand Up @@ -1330,22 +1336,22 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {
static void add_bw_rates(metric_t *metric, metric_family_t *fam, int64_t reads,
int64_t writes) {
metric->value.up_down = reads;
metric_label_set(metric, "direction", "read");
metric_label_set(metric, "direction", "receive");
metric_family_metric_append(fam, *metric);

metric->value.up_down = writes;
metric_label_set(metric, "direction", "write");
metric_label_set(metric, "direction", "transmit");
metric_family_metric_append(fam, *metric);
}

static void add_bw_ratios(metric_t *metric, metric_family_t *fam, double reads,
double writes) {
metric->value.gauge = reads;
metric_label_set(metric, "direction", "read");
metric_label_set(metric, "direction", "receive");
metric_family_metric_append(fam, *metric);

metric->value.gauge = writes;
metric_label_set(metric, "direction", "write");
metric_label_set(metric, "direction", "transmit");
metric_family_metric_append(fam, *metric);
}

Expand Down Expand Up @@ -1378,20 +1384,26 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
gpu->membw_count = mem_count;
}

/* names based on network metrics:
* https://opentelemetry.io/docs/specs/semconv/system/hardware-metrics/#hwnetwork---network-adapter-metrics
*/
metric_family_t fam_ratio = {
.help = "Average memory bandwidth usage ratio (0-1) over query interval",
.name = METRIC_PREFIX "memory_bw_ratio",
.name = METRIC_PREFIX "memory.bandwidth.utilization",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_family_t fam_rate = {
.help = "Memory bandwidth usage rate (in bytes per second)",
.name = METRIC_PREFIX "memory_bw_bytes_per_second",
.name = METRIC_PREFIX "memory.io.rate",
.type = METRIC_TYPE_UP_DOWN,
.unit = "By/s",
};
metric_family_t fam_counter = {
.help = "Memory bandwidth usage total (in bytes)",
.name = METRIC_PREFIX "memory_bw_bytes",
.name = METRIC_PREFIX "memory.io",
.type = METRIC_TYPE_COUNTER,
.unit = "By",
};
metric_t metric = {0};

Expand All @@ -1414,11 +1426,11 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
}
if (config.output & OUTPUT_BASE) {
metric.value.counter = bw.writeCounter;
metric_label_set(&metric, "direction", "write");
metric_label_set(&metric, "direction", "transmit");
metric_family_metric_append(&fam_counter, metric);

metric.value.counter = bw.readCounter;
metric_label_set(&metric, "direction", "read");
metric_label_set(&metric, "direction", "receive");
metric_family_metric_append(&fam_counter, metric);
reported_base = true;
}
Expand Down Expand Up @@ -1558,14 +1570,16 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
}

metric_family_t fam_freq = {
.help = "Sampled HW frequency (in MHz)",
.name = METRIC_PREFIX "frequency_mhz",
.help = "Sampled HW frequency (in Hz)",
.name = METRIC_PREFIX "frequency",
.type = METRIC_TYPE_GAUGE,
.unit = "Hz",
};
metric_family_t fam_ratio = {
.help = "Sampled HW frequency ratio vs (non-overclocked) max frequency",
.name = METRIC_PREFIX "frequency_ratio",
.name = METRIC_PREFIX "frequency.ratio",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_t metric = {0};

Expand Down Expand Up @@ -1597,14 +1611,14 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {

if (config.samples < 2) {
set_freq_throttled_label(&metric, gpu->frequency[0][i].throttleReasons);
/* negative value = unsupported:
* https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv416zes_freq_state_t
/* Level Zero (L0) frequency values are in MHz; a negative value means unsupported:
* https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-freq-state-t
*/
value = gpu->frequency[0][i].request;
if (value >= 0) {
metric_label_set(&metric, "type", "request");
if (config.output & OUTPUT_BASE) {
metric.value.gauge = value;
metric.value.gauge = 1e6 * value;
metric_family_metric_append(&fam_freq, metric);
reported_base = true;
}
Expand All @@ -1618,7 +1632,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
if (value >= 0) {
metric_label_set(&metric, "type", "actual");
if (config.output & OUTPUT_BASE) {
metric.value.gauge = value;
metric.value.gauge = 1e6 * value;
metric_family_metric_append(&fam_freq, metric);
reported_base = true;
}
Expand Down Expand Up @@ -1657,7 +1671,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
metric_label_set(&metric, "type", "request");
metric_label_set(&metric, "function", "min");
if (config.output & OUTPUT_BASE) {
metric.value.gauge = req_min;
metric.value.gauge = 1e6 * req_min;
metric_family_metric_append(&fam_freq, metric);
reported_base = true;
}
Expand All @@ -1668,7 +1682,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
}
metric_label_set(&metric, "function", "max");
if (config.output & OUTPUT_BASE) {
metric.value.gauge = req_max;
metric.value.gauge = 1e6 * req_max;
metric_family_metric_append(&fam_freq, metric);
reported_base = true;
}
Expand All @@ -1682,7 +1696,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
metric_label_set(&metric, "type", "actual");
metric_label_set(&metric, "function", "min");
if (config.output & OUTPUT_BASE) {
metric.value.gauge = act_min;
metric.value.gauge = 1e6 * act_min;
metric_family_metric_append(&fam_freq, metric);
reported_base = true;
}
Expand All @@ -1693,7 +1707,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
}
metric_label_set(&metric, "function", "max");
if (config.output & OUTPUT_BASE) {
metric.value.gauge = act_max;
metric.value.gauge = 1e6 * act_max;
metric_family_metric_append(&fam_freq, metric);
reported_base = true;
}
Expand Down Expand Up @@ -1763,13 +1777,15 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) {
metric_family_t fam_ratio = {
.help =
"Ratio (0-1) of HW frequency being throttled during query interval",
.name = METRIC_PREFIX "throttled_ratio",
.name = METRIC_PREFIX "throttled",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_family_t fam_counter = {
.help = "Total time HW frequency has been throttled (in seconds)",
.name = METRIC_PREFIX "throttled_seconds",
.name = METRIC_PREFIX "throttled.time",
.type = METRIC_TYPE_COUNTER_FP,
.unit = "s",
};
metric_t metric = {0};

Expand Down Expand Up @@ -1852,13 +1868,15 @@ static bool gpu_temps(gpu_device_t *gpu) {

metric_family_t fam_temp = {
.help = "Temperature sensor value (in Celsius) when queried",
.name = METRIC_PREFIX "temperature_celsius",
.name = METRIC_PREFIX "temperature",
.type = METRIC_TYPE_GAUGE,
.unit = "Cel",
};
metric_family_t fam_ratio = {
.help = "Temperature sensor value ratio to its max value when queried",
.name = METRIC_PREFIX "temperature_ratio",
.name = METRIC_PREFIX "temperature.ratio",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_t metric = {0};

Expand Down Expand Up @@ -2021,21 +2039,27 @@ static bool gpu_fabrics(gpu_device_t *gpu) {
gpu->fabric_count = port_count;
}

/* names based on network metrics:
* https://opentelemetry.io/docs/specs/semconv/system/hardware-metrics/#hwnetwork---network-adapter-metrics
*/
metric_family_t fam_ratio = {
.help =
"Average fabric port bandwidth usage ratio (0-1) over query interval",
.name = METRIC_PREFIX "fabric_port_ratio",
.name = METRIC_PREFIX "fabric.bandwidth.utilization",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_family_t fam_rate = {
.help = "Fabric port throughput rate (in bytes per second)",
.name = METRIC_PREFIX "fabric_port_bytes_per_second",
.name = METRIC_PREFIX "fabric.io.rate",
.type = METRIC_TYPE_UP_DOWN,
.unit = "By/s",
};
metric_family_t fam_counter = {
.help = "Fabric port throughput total (in bytes)",
.name = METRIC_PREFIX "fabric_port_bytes",
.name = METRIC_PREFIX "fabric.io",
.type = METRIC_TYPE_COUNTER,
.unit = "By",
};
metric_t metric = {0};

Expand Down Expand Up @@ -2118,11 +2142,11 @@ static bool gpu_fabrics(gpu_device_t *gpu) {

if (config.output & OUTPUT_BASE) {
metric.value.counter = bw.txCounter;
metric_label_set(&metric, "direction", "write");
metric_label_set(&metric, "direction", "transmit");
metric_family_metric_append(&fam_counter, metric);

metric.value.counter = bw.rxCounter;
metric_label_set(&metric, "direction", "read");
metric_label_set(&metric, "direction", "receive");
metric_family_metric_append(&fam_counter, metric);
reported_base = true;
}
Expand Down Expand Up @@ -2202,18 +2226,21 @@ static bool gpu_powers(gpu_device_t *gpu) {
metric_family_t fam_ratio = {
.help = "Ratio of average power usage vs sustained or burst "
"power limit",
.name = METRIC_PREFIX "power_ratio",
.name = METRIC_PREFIX "power.utilization",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_family_t fam_power = {
.help = "Average power usage (in Watts) over query interval",
.name = METRIC_PREFIX "power_watts",
.name = METRIC_PREFIX "power",
.type = METRIC_TYPE_UP_DOWN_FP,
.unit = "W",
};
metric_family_t fam_energy = {
.help = "Total energy consumption since boot (in joules)",
.name = METRIC_PREFIX "energy_joules",
.name = METRIC_PREFIX "energy",
.type = METRIC_TYPE_COUNTER_FP,
.unit = "J",
};
metric_t metric = {0};

Expand Down Expand Up @@ -2357,14 +2384,16 @@ static bool gpu_engines(gpu_device_t *gpu) {
metric_family_t fam_ratio = {
.help = "Average GPU engine / group utilization ratio (0-1) over query "
"interval",
.name = METRIC_PREFIX "engine_ratio",
.name = METRIC_PREFIX "engine.utilization",
.type = METRIC_TYPE_GAUGE,
.unit = "1",
};
metric_family_t fam_counter = {
.help = "GPU engine / group execution (use / activity) time total (in "
"seconds)",
.name = METRIC_PREFIX "engine_use_seconds",
.name = METRIC_PREFIX "engine.time",
.type = METRIC_TYPE_COUNTER_FP,
.unit = "s",
};
metric_t metric = {0};

Expand Down

0 comments on commit deaf663

Please sign in to comment.