gpu_sysman: use OTEL style metric names + add .unit types

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
collectd · May 15, 2024 · deaf663
1 parent c61ad28
commit deaf663
Show file tree

Hide file tree

Showing 2 changed files with 168 additions and 118 deletions.
diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c
@@ -28,6 +28,9 @@
  * - https://spec.oneapi.com/level-zero/latest/sysman/PROG.html
  * - https://spec.oneapi.io/level-zero/latest/sysman/api.html
  *
+ * Metric names and labels are based on the OpenTelemetry spec:
+ * https://opentelemetry.io/docs/specs/semconv/system/hardware-metrics/#hwgpu---gpu-metrics
+ *
  * Error handling:
  * - Allocations are done using collectd scalloc(), smalloc() and sstrdup()
  *   helpers which log an error and exit on allocation failures
@@ -64,7 +67,7 @@
 #include "utils/common/common.h"
 
 #define PLUGIN_NAME "gpu_sysman"
-#define METRIC_PREFIX "collectd_" PLUGIN_NAME "_"
+#define METRIC_PREFIX "hw.gpu."
 
 /* collectd plugin API callback finished OK */
 #define RET_OK 0
@@ -1011,12 +1014,13 @@ static void ras_submit(gpu_device_t *gpu, const char *name, const char *help,
        */
       .name = (char *)name,
       .help = (char *)help,
+      .unit = "{error}",
   };
   metric_t m = {0};
 
   m.value.counter = value;
   if (type) {
-    metric_label_set(&m, "type", type);
+    metric_label_set(&m, "hw.error.type", type);
   }
   if (subdev) {
     metric_label_set(&m, "sub_dev", subdev);
@@ -1101,54 +1105,54 @@ static bool gpu_ras(gpu_device_t *gpu) {
         // https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
       case ZES_RAS_ERROR_CAT_RESET:
         help = "Total count of HW accelerator resets attempted by the driver";
-        catname = METRIC_PREFIX "resets";
+        catname = METRIC_PREFIX "errors.resets";
         correctable = false;
         break;
       case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
         help =
             "Total count of (non-correctable) HW exceptions generated by the "
             "way workloads program the HW";
-        catname = METRIC_PREFIX "programming_errors";
+        catname = METRIC_PREFIX "errors.programming";
         correctable = false;
         break;
       case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
         help =
             "total count of (non-correctable) low-level driver communication "
             "errors";
-        catname = METRIC_PREFIX "driver_errors";
+        catname = METRIC_PREFIX "errors.driver";
         correctable = false;
         break;
         // categories which can have both correctable and uncorrectable errors
       case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
         help = "Total count of errors that have occurred in the (shader) "
                "accelerator HW";
-        catname = METRIC_PREFIX "compute_errors";
+        catname = METRIC_PREFIX "errors.compute";
         break;
       case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
         help = "Total count of errors that have occurred in the fixed-function "
                "accelerator HW";
-        catname = METRIC_PREFIX "fixed_function_errors";
+        catname = METRIC_PREFIX "errors.fixed_function";
         break;
       case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
         help = "Total count of ECC errors that have occurred in the on-chip "
                "caches";
-        catname = METRIC_PREFIX "cache_errors";
+        catname = METRIC_PREFIX "errors.cache";
         break;
       case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
         help = "Total count of ECC errors that have occurred in the display";
-        catname = METRIC_PREFIX "display_errors";
+        catname = METRIC_PREFIX "errors.display";
         break;
       default:
         help = "Total count of errors in unsupported categories";
-        catname = METRIC_PREFIX "unknown_errors";
+        catname = METRIC_PREFIX "errors.unknown";
       }
       if (correctable) {
         ras_submit(gpu, catname, help, type, subdev, value);
       } else if (props.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
         ras_submit(gpu, catname, help, NULL, subdev, value);
       }
     }
-    catname = METRIC_PREFIX "all_errors";
+    catname = METRIC_PREFIX "errors.all";
     help = "Total count of errors in all categories";
     ras_submit(gpu, catname, help, type, subdev, total);
     ok = true;
@@ -1216,13 +1220,15 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {
 
   metric_family_t fam_bytes = {
       .help = "Sampled memory usage (in bytes)",
-      .name = METRIC_PREFIX "memory_used_bytes",
+      .name = METRIC_PREFIX "memory.usage",
       .type = METRIC_TYPE_UP_DOWN,
+      .unit = "By",
   };
   metric_family_t fam_ratio = {
       .help = "Sampled memory usage ratio (0-1)",
-      .name = METRIC_PREFIX "memory_usage_ratio",
+      .name = METRIC_PREFIX "memory.utilization",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_t metric = {0};
 
@@ -1330,22 +1336,22 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {
 static void add_bw_rates(metric_t *metric, metric_family_t *fam, int64_t reads,
                          int64_t writes) {
   metric->value.up_down = reads;
-  metric_label_set(metric, "direction", "read");
+  metric_label_set(metric, "direction", "receive");
   metric_family_metric_append(fam, *metric);
 
   metric->value.up_down = writes;
-  metric_label_set(metric, "direction", "write");
+  metric_label_set(metric, "direction", "transmit");
   metric_family_metric_append(fam, *metric);
 }
 
 static void add_bw_ratios(metric_t *metric, metric_family_t *fam, double reads,
                           double writes) {
   metric->value.gauge = reads;
-  metric_label_set(metric, "direction", "read");
+  metric_label_set(metric, "direction", "receive");
   metric_family_metric_append(fam, *metric);
 
   metric->value.gauge = writes;
-  metric_label_set(metric, "direction", "write");
+  metric_label_set(metric, "direction", "transmit");
   metric_family_metric_append(fam, *metric);
 }
 
@@ -1378,20 +1384,26 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
     gpu->membw_count = mem_count;
   }
 
+  /* names based on network metrics:
+   * https://opentelemetry.io/docs/specs/semconv/system/hardware-metrics/#hwnetwork---network-adapter-metrics
+   */
   metric_family_t fam_ratio = {
       .help = "Average memory bandwidth usage ratio (0-1) over query interval",
-      .name = METRIC_PREFIX "memory_bw_ratio",
+      .name = METRIC_PREFIX "memory.bandwidth.utilization",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_family_t fam_rate = {
       .help = "Memory bandwidth usage rate (in bytes per second)",
-      .name = METRIC_PREFIX "memory_bw_bytes_per_second",
+      .name = METRIC_PREFIX "memory.io.rate",
       .type = METRIC_TYPE_UP_DOWN,
+      .unit = "By/s",
   };
   metric_family_t fam_counter = {
       .help = "Memory bandwidth usage total (in bytes)",
-      .name = METRIC_PREFIX "memory_bw_bytes",
+      .name = METRIC_PREFIX "memory.io",
       .type = METRIC_TYPE_COUNTER,
+      .unit = "By",
   };
   metric_t metric = {0};
 
@@ -1414,11 +1426,11 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
     }
     if (config.output & OUTPUT_BASE) {
       metric.value.counter = bw.writeCounter;
-      metric_label_set(&metric, "direction", "write");
+      metric_label_set(&metric, "direction", "transmit");
       metric_family_metric_append(&fam_counter, metric);
 
       metric.value.counter = bw.readCounter;
-      metric_label_set(&metric, "direction", "read");
+      metric_label_set(&metric, "direction", "receive");
       metric_family_metric_append(&fam_counter, metric);
       reported_base = true;
     }
@@ -1558,14 +1570,16 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
   }
 
   metric_family_t fam_freq = {
-      .help = "Sampled HW frequency (in MHz)",
-      .name = METRIC_PREFIX "frequency_mhz",
+      .help = "Sampled HW frequency (in Hz)",
+      .name = METRIC_PREFIX "frequency",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "Hz",
   };
   metric_family_t fam_ratio = {
       .help = "Sampled HW frequency ratio vs (non-overclocked) max frequency",
-      .name = METRIC_PREFIX "frequency_ratio",
+      .name = METRIC_PREFIX "frequency.ratio",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_t metric = {0};
 
@@ -1597,14 +1611,14 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
 
     if (config.samples < 2) {
       set_freq_throttled_label(&metric, gpu->frequency[0][i].throttleReasons);
-      /* negative value = unsupported:
-       * https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv416zes_freq_state_t
+      /* L0 values are in MHz. Negative value = unsupported:
+       * https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-freq-state-t
        */
       value = gpu->frequency[0][i].request;
       if (value >= 0) {
         metric_label_set(&metric, "type", "request");
         if (config.output & OUTPUT_BASE) {
-          metric.value.gauge = value;
+          metric.value.gauge = 1e6 * value;
           metric_family_metric_append(&fam_freq, metric);
           reported_base = true;
         }
@@ -1618,7 +1632,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
       if (value >= 0) {
         metric_label_set(&metric, "type", "actual");
         if (config.output & OUTPUT_BASE) {
-          metric.value.gauge = value;
+          metric.value.gauge = 1e6 * value;
           metric_family_metric_append(&fam_freq, metric);
           reported_base = true;
         }
@@ -1657,7 +1671,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
         metric_label_set(&metric, "type", "request");
         metric_label_set(&metric, "function", "min");
         if (config.output & OUTPUT_BASE) {
-          metric.value.gauge = req_min;
+          metric.value.gauge = 1e6 * req_min;
           metric_family_metric_append(&fam_freq, metric);
           reported_base = true;
         }
@@ -1668,7 +1682,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
         }
         metric_label_set(&metric, "function", "max");
         if (config.output & OUTPUT_BASE) {
-          metric.value.gauge = req_max;
+          metric.value.gauge = 1e6 * req_max;
           metric_family_metric_append(&fam_freq, metric);
           reported_base = true;
         }
@@ -1682,7 +1696,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
         metric_label_set(&metric, "type", "actual");
         metric_label_set(&metric, "function", "min");
         if (config.output & OUTPUT_BASE) {
-          metric.value.gauge = act_min;
+          metric.value.gauge = 1e6 * act_min;
           metric_family_metric_append(&fam_freq, metric);
           reported_base = true;
         }
@@ -1693,7 +1707,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
         }
         metric_label_set(&metric, "function", "max");
         if (config.output & OUTPUT_BASE) {
-          metric.value.gauge = act_max;
+          metric.value.gauge = 1e6 * act_max;
           metric_family_metric_append(&fam_freq, metric);
           reported_base = true;
         }
@@ -1763,13 +1777,15 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) {
   metric_family_t fam_ratio = {
       .help =
           "Ratio (0-1) of HW frequency being throttled during query interval",
-      .name = METRIC_PREFIX "throttled_ratio",
+      .name = METRIC_PREFIX "throttled",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_family_t fam_counter = {
       .help = "Total time HW frequency has been throttled (in seconds)",
-      .name = METRIC_PREFIX "throttled_seconds",
+      .name = METRIC_PREFIX "throttled.time",
       .type = METRIC_TYPE_COUNTER_FP,
+      .unit = "s",
   };
   metric_t metric = {0};
 
@@ -1852,13 +1868,15 @@ static bool gpu_temps(gpu_device_t *gpu) {
 
   metric_family_t fam_temp = {
       .help = "Temperature sensor value (in Celsius) when queried",
-      .name = METRIC_PREFIX "temperature_celsius",
+      .name = METRIC_PREFIX "temperature",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "Cel",
   };
   metric_family_t fam_ratio = {
       .help = "Temperature sensor value ratio to its max value when queried",
-      .name = METRIC_PREFIX "temperature_ratio",
+      .name = METRIC_PREFIX "temperature.ratio",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_t metric = {0};
 
@@ -2021,21 +2039,27 @@ static bool gpu_fabrics(gpu_device_t *gpu) {
     gpu->fabric_count = port_count;
   }
 
+  /* names based on network metrics:
+   * https://opentelemetry.io/docs/specs/semconv/system/hardware-metrics/#hwnetwork---network-adapter-metrics
+   */
   metric_family_t fam_ratio = {
       .help =
           "Average fabric port bandwidth usage ratio (0-1) over query interval",
-      .name = METRIC_PREFIX "fabric_port_ratio",
+      .name = METRIC_PREFIX "fabric.bandwidth.utilization",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_family_t fam_rate = {
       .help = "Fabric port throughput rate (in bytes per second)",
-      .name = METRIC_PREFIX "fabric_port_bytes_per_second",
+      .name = METRIC_PREFIX "fabric.io.rate",
       .type = METRIC_TYPE_UP_DOWN,
+      .unit = "By/s",
   };
   metric_family_t fam_counter = {
       .help = "Fabric port throughput total (in bytes)",
-      .name = METRIC_PREFIX "fabric_port_bytes",
+      .name = METRIC_PREFIX "fabric.io",
       .type = METRIC_TYPE_COUNTER,
+      .unit = "By",
   };
   metric_t metric = {0};
 
@@ -2118,11 +2142,11 @@ static bool gpu_fabrics(gpu_device_t *gpu) {
 
     if (config.output & OUTPUT_BASE) {
       metric.value.counter = bw.txCounter;
-      metric_label_set(&metric, "direction", "write");
+      metric_label_set(&metric, "direction", "transmit");
       metric_family_metric_append(&fam_counter, metric);
 
       metric.value.counter = bw.rxCounter;
-      metric_label_set(&metric, "direction", "read");
+      metric_label_set(&metric, "direction", "receive");
       metric_family_metric_append(&fam_counter, metric);
       reported_base = true;
     }
@@ -2202,18 +2226,21 @@ static bool gpu_powers(gpu_device_t *gpu) {
   metric_family_t fam_ratio = {
       .help = "Ratio of average power usage vs sustained or burst "
               "power limit",
-      .name = METRIC_PREFIX "power_ratio",
+      .name = METRIC_PREFIX "power.utilization",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_family_t fam_power = {
       .help = "Average power usage (in Watts) over query interval",
-      .name = METRIC_PREFIX "power_watts",
+      .name = METRIC_PREFIX "power",
       .type = METRIC_TYPE_UP_DOWN_FP,
+      .unit = "W",
   };
   metric_family_t fam_energy = {
       .help = "Total energy consumption since boot (in joules)",
-      .name = METRIC_PREFIX "energy_joules",
+      .name = METRIC_PREFIX "energy",
       .type = METRIC_TYPE_COUNTER_FP,
+      .unit = "J",
   };
   metric_t metric = {0};
 
@@ -2357,14 +2384,16 @@ static bool gpu_engines(gpu_device_t *gpu) {
   metric_family_t fam_ratio = {
       .help = "Average GPU engine / group utilization ratio (0-1) over query "
               "interval",
-      .name = METRIC_PREFIX "engine_ratio",
+      .name = METRIC_PREFIX "engine.utilization",
       .type = METRIC_TYPE_GAUGE,
+      .unit = "1",
   };
   metric_family_t fam_counter = {
       .help = "GPU engine / group execution (use / activity) time total (in "
               "seconds)",
-      .name = METRIC_PREFIX "engine_use_seconds",
+      .name = METRIC_PREFIX "engine.time",
       .type = METRIC_TYPE_COUNTER_FP,
+      .unit = "s",
   };
   metric_t metric = {0};