Skip to content

Commit

Permalink
Merge pull request #590 from cybozu-go/add-rebooting-metrics
Browse files Browse the repository at this point in the history
Add "cke_node_reboot_status" metrics
  • Loading branch information
morimoto-cybozu committed Jan 17, 2023
2 parents c6aa43d + 4a41341 commit 06ada29
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 17 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve

## [Unreleased]

### Added

- Add `cke_node_reboot_status` metrics [#590](https://github.com/cybozu-go/cke/pull/590)

## [1.24.0]

### Changed
Expand Down Expand Up @@ -42,7 +46,7 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve

## Ancient changes

- See [release-1.23/CHANGELOG.md](https://github.com/cybozu-go/cke/blob/release-1.23/CHANGELOG.md) for changes in CKE 1.2.
- See [release-1.23/CHANGELOG.md](https://github.com/cybozu-go/cke/blob/release-1.23/CHANGELOG.md) for changes in CKE 1.23.
- See [release-1.22/CHANGELOG.md](https://github.com/cybozu-go/cke/blob/release-1.22/CHANGELOG.md) for changes in CKE 1.22.
- See [release-1.21/CHANGELOG.md](https://github.com/cybozu-go/cke/blob/release-1.21/CHANGELOG.md) for changes in CKE 1.21.
- See [release-1.20/CHANGELOG.md](https://github.com/cybozu-go/cke/blob/release-1.20/CHANGELOG.md) for changes in CKE 1.20.
Expand Down
23 changes: 12 additions & 11 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ Metrics

CKE exposes the following metrics with the Prometheus format at `/metrics` REST API endpoint. All these metrics are prefixed with `cke_`.

| Name | Description | Type | Labels |
| ------------------------------------- | -------------------------------------------------------------------------- | ----- | -------- |
| leader | True (=1) if this server is the leader of CKE. | Gauge | |
| operation_phase | 1 if CKE is operating in the phase specified by the `phase` label. | Gauge | `phase` |
| operation_phase_timestamp_seconds | The Unix timestamp when `operation_phase` was last updated. | Gauge | |
| reboot_queue_entries | The number of reboot queue entries remaining. | Gauge | |
| reboot_queue_items | The number of reboot queue entries remaining per status. | Gauge | `status` |
| sabakan_integration_successful | True (=1) if sabakan-integration satisfies constraints. | Gauge | |
| sabakan_integration_timestamp_seconds | The Unix timestamp when `sabakan_integration_successful` was last updated. | Gauge | |
| sabakan_workers | The number of worker nodes for each role. | Gauge | `role` |
| sabakan_unused_machines | The number of unused machines. | Gauge | |
| Name | Description | Type | Labels |
| ------------------------------------- | -------------------------------------------------------------------------- | ----- | ---------------- |
| leader | True (=1) if this server is the leader of CKE. | Gauge | |
| node_reboot_status | The reboot status of a node. | Gauge | `node`, `status` |
| operation_phase | 1 if CKE is operating in the phase specified by the `phase` label. | Gauge | `phase` |
| operation_phase_timestamp_seconds | The Unix timestamp when `operation_phase` was last updated. | Gauge | |
| reboot_queue_entries | The number of reboot queue entries remaining. | Gauge | |
| reboot_queue_items | The number of reboot queue entries remaining per status. | Gauge | `status` |
| sabakan_integration_successful | True (=1) if sabakan-integration satisfies constraints. | Gauge | |
| sabakan_integration_timestamp_seconds | The Unix timestamp when `sabakan_integration_successful` was last updated. | Gauge | |
| sabakan_workers | The number of worker nodes for each role. | Gauge | `role` |
| sabakan_unused_machines | The number of unused machines. | Gauge | |

All metrics but `leader` are available only when the server is the leader of CKE.
`sabakan_*` metrics are available only when [Sabakan integration](sabakan-integration.md) is enabled.
Expand Down
2 changes: 1 addition & 1 deletion metrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func NewCollector(client *v3.Client) prometheus.Collector {
isAvailable: isOperationPhaseAvailable,
},
"reboot": {
collectors: []prometheus.Collector{rebootQueueEntries, rebootQueueItems},
collectors: []prometheus.Collector{rebootQueueEntries, rebootQueueItems, nodeRebootStatus},
isAvailable: isRebootAvailable,
},
"sabakan_integration": {
Expand Down
8 changes: 8 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ var rebootQueueItems = prometheus.NewGaugeVec(
[]string{"status"},
)

// nodeRebootStatus is the "node_reboot_status" gauge vector. Each series is
// identified by the "node" and "status" labels.
var nodeRebootStatus = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: namespace,
		Name:      "node_reboot_status",
		Help:      "The reboot status of a node.",
	},
	[]string{"node", "status"},
)

var sabakanIntegrationSuccessful = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: namespace,
Expand Down
16 changes: 16 additions & 0 deletions metrics/updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,22 @@ func UpdateRebootQueueItems(counts map[string]int) {
}
}

// UpdateNodeRebootStatus updates "node_reboot_status".
//
// nodeStatus maps a node name to a map from reboot status name to a flag
// telling whether the node is in that status. Every (node, status) pair is
// exported as a gauge whose value is 1 when the flag is true and 0 otherwise.
//
// The vector is reset before it is repopulated so that series belonging to
// nodes that are absent from nodeStatus stop being exported instead of
// lingering forever with their last value.
//
// NOTE(review): a scrape landing between Reset and the Set calls could observe
// an incomplete set of series — confirm that updates and collection are
// serialized by the collector.
func UpdateNodeRebootStatus(nodeStatus map[string]map[string]bool) {
	nodeRebootStatus.Reset()

	for node, statuses := range nodeStatus {
		for status, matches := range statuses {
			var value float64
			if matches {
				value = 1
			}
			nodeRebootStatus.With(map[string]string{
				"node":   node,
				"status": status,
			}).Set(value)
		}
	}
}

// isRebootAvailable reports whether the reboot metrics are available.
// It returns the current value of isLeader (defined outside this function)
// and always a nil error; both arguments are ignored.
func isRebootAvailable(_ context.Context, _ storage) (bool, error) {
return isLeader, nil
}
Expand Down
56 changes: 56 additions & 0 deletions metrics/updater_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func TestMetricsUpdater(t *testing.T) {
t.Run("UpdateOperationPhase", testUpdateOperationPhase)
t.Run("UpdateRebootQueueEntries", testUpdateRebootQueueEntries)
t.Run("UpdateRebootQueueItems", testUpdateRebootQueueItems)
t.Run("UpdateNodeRebootStatus", testUpdateNodeRebootStatus)
t.Run("UpdateSabakanIntegration", testUpdateSabakanIntegration)
}

Expand Down Expand Up @@ -372,6 +373,61 @@ func testUpdateRebootQueueItems(t *testing.T) {
}
}

func testUpdateNodeRebootStatus(t *testing.T) {
input := map[string]map[string]bool{
"node1": {
"queued": false,
"draining": false,
"rebooting": true,
"cancelled": false,
},
"node2": {
"queued": false,
"draining": false,
"rebooting": false,
"cancelled": false,
},
}
expected := input

ctx := context.Background()
defer ctx.Done()

collector, _ := newTestCollector()
handler := GetHandler(collector)

UpdateNodeRebootStatus(input)

w := httptest.NewRecorder()
req := httptest.NewRequest("GET", "/metrics", nil)
handler.ServeHTTP(w, req)

metricsFamily, err := parseMetrics(w.Result())
if err != nil {
t.Fatal(err)
}

actual := make(map[string]map[string]bool)
for _, mf := range metricsFamily {
if *mf.Name != "cke_node_reboot_status" {
continue
}
for _, m := range mf.Metric {
labels := labelToMap(m.Label)
node := labels["node"]
status := labels["status"]
if _, ok := actual[node]; !ok {
actual[node] = make(map[string]bool)
}
actual[node][status] = *m.Gauge.Value != 0
}
}

if !cmp.Equal(actual, expected) {
t.Errorf("unexpected map was build from cke_node_reboot_status. expected: %v, actual: %v", expected, actual)
}
}

func testUpdateSabakanIntegration(t *testing.T) {
testCases := []updateSabakanIntegrationTestCase{
{
Expand Down
7 changes: 7 additions & 0 deletions op/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -520,9 +520,16 @@ func (c rebootRecalcMetricsCommand) Run(ctx context.Context, inf cke.Infrastruct
if err != nil {
return err
}
cluster, err := inf.Storage().GetCluster(ctx)
if err != nil {
return err
}

metrics.UpdateRebootQueueEntries(len(rqEntries))
itemCounts := cke.CountRebootQueueEntries(rqEntries)
metrics.UpdateRebootQueueItems(itemCounts)
nodeStatus := cke.BuildNodeRebootStatus(cluster.Nodes, rqEntries)
metrics.UpdateNodeRebootStatus(nodeStatus)

return nil
}
Expand Down
36 changes: 32 additions & 4 deletions reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ const (
RebootStatusCancelled = RebootStatus("cancelled")
)

// rebootStatuses lists the reboot statuses used to pre-populate per-status
// results, so that every status appears even when nothing is in it.
var rebootStatuses = []RebootStatus{
	RebootStatusQueued,
	RebootStatusDraining,
	RebootStatusRebooting,
	RebootStatusCancelled,
}

// RebootQueueEntry represents a queue entry of reboot operation
type RebootQueueEntry struct {
Index int64 `json:"index,string"`
Expand Down Expand Up @@ -58,14 +60,40 @@ func DedupRebootQueueEntries(entries []*RebootQueueEntry) []*RebootQueueEntry {

// CountRebootQueueEntries returns the number of reboot queue entries per
// status, keyed by the status name.
//
// Every status listed in rebootStatuses is present in the result, with a count
// of 0 when no entry has it, so callers always see the full list of possible
// statuses.
func CountRebootQueueEntries(entries []*RebootQueueEntry) map[string]int {
	ret := make(map[string]int, len(rebootStatuses))
	for _, status := range rebootStatuses {
		// initialize explicitly to provide list of possible statuses
		ret[string(status)] = 0
	}

	for _, entry := range entries {
		ret[string(entry.Status)]++
	}

	return ret
}

// BuildNodeRebootStatus reports, for each node in nodes, which reboot statuses
// the node currently has in the reboot queue.
//
// The result is keyed by node name and then by status name. Every node gets an
// entry for each status in rebootStatuses, defaulting to false. A queue entry
// is matched to a node by comparing the entry's Node field with the node's
// Address; entries that match no node are ignored.
func BuildNodeRebootStatus(nodes []*Node, entries []*RebootQueueEntry) map[string]map[string]bool {
	statusByNode := make(map[string]map[string]bool)
	nameByAddr := make(map[string]string)

	for _, n := range nodes {
		nodeName := n.Nodename()

		// Start from an all-false table so that every possible status is listed.
		flags := make(map[string]bool)
		for _, s := range rebootStatuses {
			flags[string(s)] = false
		}

		statusByNode[nodeName] = flags
		nameByAddr[n.Address] = nodeName
	}

	for _, e := range entries {
		// An unknown address means the node was removed from the K8s cluster
		// after it was queued.
		if nodeName, known := nameByAddr[e.Node]; known {
			statusByNode[nodeName][string(e.Status)] = true
		}
	}

	return statusByNode
}
30 changes: 30 additions & 0 deletions reboot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,33 @@ func TestCountRebootQueueEntries(t *testing.T) {
t.Errorf("expected: %v, actual: %v", expected, actual)
}
}

// TestBuildNodeRebootStatus checks that BuildNodeRebootStatus lists every
// status for every given node, marks only the status of a matching queue
// entry as true, and ignores an entry whose address matches no node.
func TestBuildNodeRebootStatus(t *testing.T) {
	nodes := []*Node{
		{Hostname: "node1", Address: "1.1.1.1"},
		{Hostname: "node2", Address: "2.2.2.2"},
	}
	// The second entry's address matches none of the nodes above.
	queue := []*RebootQueueEntry{
		{Node: "1.1.1.1", Status: RebootStatusRebooting},
		{Node: "3.3.3.3", Status: RebootStatusCancelled},
	}

	want := map[string]map[string]bool{
		"node1": {
			"queued":    false,
			"draining":  false,
			"rebooting": true,
			"cancelled": false,
		},
		"node2": {
			"queued":    false,
			"draining":  false,
			"rebooting": false,
			"cancelled": false,
		},
	}

	got := BuildNodeRebootStatus(nodes, queue)
	if cmp.Equal(got, want) {
		return
	}
	t.Errorf("expected: %v, actual: %v", want, got)
}

0 comments on commit 06ada29

Please sign in to comment.