forked from hashicorp/nomad
/
check.go
135 lines (109 loc) · 3.48 KB
/
check.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package command
import (
"fmt"
"reflect"
"strconv"
"strings"
"time"
)
const (
HealthCritical = 2
HealthWarn = 1
HealthPass = 0
HealthUnknown = 3
)
type AgentCheckCommand struct {
Meta
}
func (c *AgentCheckCommand) Help() string {
helpText := `
Usage: nomad check
Display state of the Nomad agent. The exit code of the command is Nagios
compatible and could be used with alerting systems.
General Options:
` + generalOptionsUsage() + `
Agent Check Options:
-min-peers
Minimum number of peers that a server is expected to know.
-min-servers
Minumum number of servers that a client is expected to know.
`
return strings.TrimSpace(helpText)
}
func (c *AgentCheckCommand) Synopsis() string {
return "Displays health of the local Nomad agent"
}
func (c *AgentCheckCommand) Run(args []string) int {
var minPeers, minServers int
flags := c.Meta.FlagSet("check", FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
flags.IntVar(&minPeers, "min-peers", 0, "")
flags.IntVar(&minServers, "min-servers", 1, "")
if err := flags.Parse(args); err != nil {
return 1
}
client, err := c.Meta.Client()
if err != nil {
c.Ui.Error(fmt.Sprintf("error initializing client: %s", err))
return HealthCritical
}
info, err := client.Agent().Self()
if err != nil {
c.Ui.Output(fmt.Sprintf("unable to query agent info: %v", err))
return HealthCritical
}
if stats, ok := info["stats"]; !ok && (reflect.TypeOf(stats).Kind() == reflect.Map) {
c.Ui.Error("error getting stats from the agent api")
return 1
}
if _, ok := info["stats"]["nomad"]; ok {
return c.checkServerHealth(info["stats"], minPeers)
}
if _, ok := info["stats"]["client"]; ok {
return c.checkClientHealth(info["stats"], minServers)
}
return HealthWarn
}
// checkServerHealth returns the health of a server.
// TODO Add more rules for determining server health
func (c *AgentCheckCommand) checkServerHealth(info map[string]interface{}, minPeers int) int {
raft := info["raft"].(map[string]interface{})
knownPeers, err := strconv.Atoi(raft["num_peers"].(string))
if err != nil {
c.Ui.Output(fmt.Sprintf("unable to get known peers: %v", err))
return HealthCritical
}
if knownPeers < minPeers {
c.Ui.Output(fmt.Sprintf("known peers: %v, is less than expected number of peers: %v", knownPeers, minPeers))
return HealthCritical
}
return HealthPass
}
// checkClientHealth returns the health of a client
func (c *AgentCheckCommand) checkClientHealth(info map[string]interface{}, minServers int) int {
clientStats := info["client"].(map[string]interface{})
knownServers, err := strconv.Atoi(clientStats["known_servers"].(string))
if err != nil {
c.Ui.Output(fmt.Sprintf("unable to get known servers: %v", err))
return HealthCritical
}
heartbeatTTL, err := time.ParseDuration(clientStats["heartbeat_ttl"].(string))
if err != nil {
c.Ui.Output(fmt.Sprintf("unable to parse heartbeat TTL: %v", err))
return HealthCritical
}
lastHeartbeat, err := time.ParseDuration(clientStats["last_heartbeat"].(string))
if err != nil {
c.Ui.Output(fmt.Sprintf("unable to parse last heartbeat: %v", err))
return HealthCritical
}
if lastHeartbeat > heartbeatTTL {
c.Ui.Output(fmt.Sprintf("last heartbeat was %q time ago, expected heartbeat ttl: %q", lastHeartbeat, heartbeatTTL))
return HealthCritical
}
if knownServers < minServers {
c.Ui.Output(fmt.Sprintf("known servers: %v, is less than expected number of servers: %v", knownServers, minServers))
return HealthCritical
}
return HealthPass
}