-
Notifications
You must be signed in to change notification settings - Fork 296
/
metrics.rs
97 lines (89 loc) · 3.44 KB
/
metrics.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
use prometheus::{IntCounter, IntCounterVec, IntGauge, IntGaugeVec};
use strum::IntoEnumIterator;
use strum_macros::{EnumIter, IntoStaticStr};
/// Port number (9091) associated with Prometheus over HTTP.
// NOTE(review): presumably the port on which the orchestrator serves its
// metrics endpoint - confirm against the users of this constant.
pub const PROMETHEUS_HTTP_PORT: u16 = 9091;
/// Collection of the metrics exported by the orchestrator.
///
/// Every field is created in `OrchestratorMetrics::new`; the metric names and
/// descriptions quoted on the fields are the ones passed to the registry there.
#[derive(Clone)]
pub struct OrchestratorMetrics {
/// Gauge `ssh_access_registry_version`: registry version last used to update
/// the SSH public keys.
pub ssh_access_registry_version: IntGauge,
/// Gauge `firewall_registry_version`: latest registry version used for
/// firewall configuration.
pub firewall_registry_version: IntGauge,
/// Gauge `reboot_duration_seconds`: the time it took for the node to reboot.
pub reboot_duration: IntGauge,
/// Gauge vector `ic_orchestrator_info`, labelled by `ic_active_version`:
/// version info for the running orchestrator.
pub orchestrator_info: IntGaugeVec,
/// Gauge vector `orchestrator_key_rotation_status`, labelled by `status`.
/// Updated through `observe_key_rotation_status` and
/// `observe_key_rotation_error`.
pub key_rotation_status: IntGaugeVec,
/// Counter vector `orchestrator_tecdsa_key_changed_errors_total`, labelled by
/// `key_id`: critical error counter monitoring changed tECDSA public keys.
pub ecdsa_key_changed_errors: IntCounterVec,
/// Counter `orchestrator_failed_consecutive_upgrade_checks_total`: number of
/// times the upgrade check failed consecutively.
pub failed_consecutive_upgrade_checks: IntCounter,
}
/// Status values reported for key rotation.
///
/// Each variant is used as a value of the `status` label on the
/// `key_rotation_status` gauge vector; the label string is obtained through the
/// derived `IntoStaticStr` conversion, and all variants are enumerated through
/// the derived `EnumIter` when the gauges are updated.
///
/// The declaration order of the variants determines the derived `Ord` /
/// `PartialOrd` ordering and the `EnumIter` iteration order.
#[derive(Copy, Clone, Debug, EnumIter, Eq, IntoStaticStr, PartialOrd, Ord, PartialEq)]
pub enum KeyRotationStatus {
// NOTE(review): presumably key rotation is switched off - confirm with callers.
Disabled,
// NOTE(review): presumably the last rotation happened too recently to rotate
// again - confirm with callers.
TooRecent,
/// Classified as transient by `is_transient`.
Rotating,
/// Classified as transient by `is_transient`.
Registering,
// NOTE(review): presumably the rotated key has been registered - confirm
// with callers.
Registered,
/// The only variant for which `is_error` returns `true`.
Error,
}
impl KeyRotationStatus {
fn is_transient(self) -> bool {
matches!(
self,
KeyRotationStatus::Registering | KeyRotationStatus::Rotating
)
}
fn is_error(self) -> bool {
matches!(self, KeyRotationStatus::Error)
}
}
impl OrchestratorMetrics {
pub fn new(metrics_registry: &ic_metrics::MetricsRegistry) -> Self {
Self {
ssh_access_registry_version: metrics_registry.int_gauge(
"ssh_access_registry_version",
"Registry version last used to update the SSH public keys",
),
firewall_registry_version: metrics_registry.int_gauge(
"firewall_registry_version",
"Latest registry version used for firewall configuration",
),
reboot_duration: metrics_registry.int_gauge(
"reboot_duration_seconds",
"The time it took for the node to reboot",
),
orchestrator_info: metrics_registry.int_gauge_vec(
"ic_orchestrator_info",
"version info for the internet computer orchestrator running.",
&["ic_active_version"],
),
key_rotation_status: metrics_registry.int_gauge_vec(
"orchestrator_key_rotation_status",
"The current key rotation status.",
&["status"],
),
ecdsa_key_changed_errors: metrics_registry.int_counter_vec(
"orchestrator_tecdsa_key_changed_errors_total",
"Critical error counter monitoring changed tECDSA public keys",
&["key_id"],
),
failed_consecutive_upgrade_checks: metrics_registry.int_counter(
"orchestrator_failed_consecutive_upgrade_checks_total",
"Number of times the upgrade check failed consecutively",
),
}
}
/// Set the current key rotation status to the given status and clear all other states.
/// If the given status is a transient state, do not clear the error status.
pub fn observe_key_rotation_status(&self, status: KeyRotationStatus) {
// don't clear error status when going through transient states
KeyRotationStatus::iter()
.filter(|s| !status.is_transient() || !s.is_error())
.for_each(|s| {
self.key_rotation_status
.with_label_values(&[s.into()])
.set((s == status) as i64);
});
}
/// Set the error status to '1'.
pub fn observe_key_rotation_error(&self) {
self.key_rotation_status
.with_label_values(&[KeyRotationStatus::Error.into()])
.set(1);
}
}