Skip to content

Commit

Permalink
gpu_sysman: remove _total suffix for monotonic metrics
Browse files Browse the repository at this point in the history
The "write_prometheus" plugin already adds that suffix unconditionally to the
names of all monotonic metric types.

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
  • Loading branch information
eero-t authored and mrunge committed May 14, 2024
1 parent 1162247 commit 8fb24ac
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 23 deletions.
28 changes: 14 additions & 14 deletions src/gpu_sysman.c
Original file line number Diff line number Diff line change
Expand Up @@ -1101,54 +1101,54 @@ static bool gpu_ras(gpu_device_t *gpu) {
// https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
case ZES_RAS_ERROR_CAT_RESET:
help = "Total count of HW accelerator resets attempted by the driver";
catname = METRIC_PREFIX "resets_total";
catname = METRIC_PREFIX "resets";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
help =
"Total count of (non-correctable) HW exceptions generated by the "
"way workloads program the HW";
catname = METRIC_PREFIX "programming_errors_total";
catname = METRIC_PREFIX "programming_errors";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
help =
"total count of (non-correctable) low-level driver communication "
"errors";
catname = METRIC_PREFIX "driver_errors_total";
catname = METRIC_PREFIX "driver_errors";
correctable = false;
break;
// categories which can have both correctable and uncorrectable errors
case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
help = "Total count of errors that have occurred in the (shader) "
"accelerator HW";
catname = METRIC_PREFIX "compute_errors_total";
catname = METRIC_PREFIX "compute_errors";
break;
case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
help = "Total count of errors that have occurred in the fixed-function "
"accelerator HW";
catname = METRIC_PREFIX "fixed_function_errors_total";
catname = METRIC_PREFIX "fixed_function_errors";
break;
case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
help = "Total count of ECC errors that have occurred in the on-chip "
"caches";
catname = METRIC_PREFIX "cache_errors_total";
catname = METRIC_PREFIX "cache_errors";
break;
case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
help = "Total count of ECC errors that have occurred in the display";
catname = METRIC_PREFIX "display_errors_total";
catname = METRIC_PREFIX "display_errors";
break;
default:
help = "Total count of errors in unsupported categories";
catname = METRIC_PREFIX "unknown_errors_total";
catname = METRIC_PREFIX "unknown_errors";
}
if (correctable) {
ras_submit(gpu, catname, help, type, subdev, value);
} else if (props.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
ras_submit(gpu, catname, help, NULL, subdev, value);
}
}
catname = METRIC_PREFIX "all_errors_total";
catname = METRIC_PREFIX "all_errors";
help = "Total count of errors in all categories";
ras_submit(gpu, catname, help, type, subdev, total);
ok = true;
Expand Down Expand Up @@ -1390,7 +1390,7 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
};
metric_family_t fam_counter = {
.help = "Memory bandwidth usage total (in bytes)",
.name = METRIC_PREFIX "memory_bw_bytes_total",
.name = METRIC_PREFIX "memory_bw_bytes",
.type = METRIC_TYPE_COUNTER,
};
metric_t metric = {0};
Expand Down Expand Up @@ -1768,7 +1768,7 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) {
};
metric_family_t fam_counter = {
.help = "Total time HW frequency has been throttled (in seconds)",
.name = METRIC_PREFIX "throttled_seconds_total",
.name = METRIC_PREFIX "throttled_seconds",
.type = METRIC_TYPE_COUNTER_FP,
};
metric_t metric = {0};
Expand Down Expand Up @@ -2034,7 +2034,7 @@ static bool gpu_fabrics(gpu_device_t *gpu) {
};
metric_family_t fam_counter = {
.help = "Fabric port throughput total (in bytes)",
.name = METRIC_PREFIX "fabric_port_bytes_total",
.name = METRIC_PREFIX "fabric_port_bytes",
.type = METRIC_TYPE_COUNTER,
};
metric_t metric = {0};
Expand Down Expand Up @@ -2212,7 +2212,7 @@ static bool gpu_powers(gpu_device_t *gpu) {
};
metric_family_t fam_energy = {
.help = "Total energy consumption since boot (in joules)",
.name = METRIC_PREFIX "energy_joules_total",
.name = METRIC_PREFIX "energy_joules",
.type = METRIC_TYPE_COUNTER_FP,
};
metric_t metric = {0};
Expand Down Expand Up @@ -2363,7 +2363,7 @@ static bool gpu_engines(gpu_device_t *gpu) {
metric_family_t fam_counter = {
.help = "GPU engine / group execution (use / activity) time total (in "
"seconds)",
.name = METRIC_PREFIX "engine_use_seconds_total",
.name = METRIC_PREFIX "engine_use_seconds",
.type = METRIC_TYPE_COUNTER_FP,
};
metric_t metric = {0};
Expand Down
18 changes: 9 additions & 9 deletions src/gpu_sysman_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ typedef struct {

static metrics_validation_t valid_metrics[] = {
/* gauge value changes */
{"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0},
{"all_errors", true, false, RAS_INIT, RAS_INC, 0, 0.0},
{"frequency_mhz/actual/current/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0,
0.0},
{"frequency_mhz/actual/current/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0,
Expand Down Expand Up @@ -624,14 +624,14 @@ static metrics_validation_t valid_metrics[] = {
{"temperature_ratio", true, false, TEMP_RATIO_INIT, TEMP_RATIO_INC, 0, 0.0},

/* while counters increase, per-time incremented value should stay same */
{"energy_joules_total", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6,
0, 0.0},
{"energy_joules", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6, 0,
0.0},
{"engine_ratio/all", true, false, COUNTER_RATIO, 0, 0, 0.0},
{"engine_use_seconds_total/all", true, false, COUNTER_START / 1e6,
{"engine_use_seconds/all", true, false, COUNTER_START / 1e6,
COUNTER_INC / 1e6, 0, 0.0},
{"fabric_port_bytes_total/healthy/off/read", true, false, 2 * COUNTER_START,
{"fabric_port_bytes/healthy/off/read", true, false, 2 * COUNTER_START,
2 * COUNTER_INC, 0, 0.0},
{"fabric_port_bytes_total/healthy/off/write", true, false, COUNTER_START,
{"fabric_port_bytes/healthy/off/write", true, false, COUNTER_START,
COUNTER_INC, 0, 0.0},
{"fabric_port_bytes_per_second/healthy/off/read", true, false,
2 * COUNTER_RATE, 0, 0, 0.0},
Expand All @@ -641,9 +641,9 @@ static metrics_validation_t valid_metrics[] = {
0, 0, 0.0},
{"fabric_port_ratio/healthy/off/write", true, false, COUNTER_MAX_RATIO, 0,
0, 0.0},
{"memory_bw_bytes_total/HBM/system/read", true, false, 2 * COUNTER_START,
{"memory_bw_bytes/HBM/system/read", true, false, 2 * COUNTER_START,
2 * COUNTER_INC, 0, 0.0},
{"memory_bw_bytes_total/HBM/system/write", true, false, COUNTER_START,
{"memory_bw_bytes/HBM/system/write", true, false, COUNTER_START,
COUNTER_INC, 0, 0.0},
{"memory_bw_bytes_per_second/HBM/system/read", true, false,
2 * COUNTER_RATE, 0, 0, 0.0},
Expand All @@ -656,7 +656,7 @@ static metrics_validation_t valid_metrics[] = {
{"power_ratio", true, false, COUNTER_INC / POWER_LIMIT / TIME_INC, 0, 0,
0.0},
{"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
{"throttled_seconds_total/gpu", true, false, COUNTER_START / 1e6,
{"throttled_seconds/gpu", true, false, COUNTER_START / 1e6,
COUNTER_INC / 1e6, 0, 0.0},
{"throttled_ratio/gpu", true, false, COUNTER_RATIO, 0, 0, 0.0},
};
Expand Down

0 comments on commit 8fb24ac

Please sign in to comment.