/
health.go
117 lines (97 loc) · 2.96 KB
/
health.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
Copyright 2017 The Rook Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package osd
import (
"time"
"github.com/rook/rook/pkg/clusterd"
"github.com/rook/rook/pkg/daemon/ceph/client"
)
// upStatus is the status value that osdStatus treats as "up"; an OSD reporting
// any other value is handled as down.
const upStatus = 1

var (
	// healthCheckInterval is how long Start waits between two OSD status checks.
	healthCheckInterval = time.Minute
	// osdGracePeriod is how long an OSD may stay tracked as down before
	// osdStatus logs that the grace period has been exceeded.
	osdGracePeriod = 10 * time.Minute
)
// Monitor periodically checks the status of the OSDs in a cluster and keeps
// track of the ones that were found not to be up.
type Monitor struct {
// context is passed to the ceph client when the OSD dump is requested.
context *clusterd.Context
// clusterName identifies the cluster whose OSDs are checked; it is also
// reported as the namespace in the stop log message.
clusterName string
// lastStatus tracks the OSDs that are currently considered down.
// key - OSD id; value: time of the status change.
// An entry is added when an OSD is first seen not up, refreshed when the
// grace period has been exceeded, and removed once the OSD is up again.
lastStatus map[int]time.Time
}
// NewMonitor creates a Monitor for the OSDs of the given cluster. The returned
// Monitor starts with an empty tracking map, i.e. no OSD is recorded as down.
func NewMonitor(context *clusterd.Context, clusterName string) *Monitor {
	monitor := &Monitor{
		context:     context,
		clusterName: clusterName,
		lastStatus:  make(map[int]time.Time),
	}
	return monitor
}
// Start runs the OSD status check every healthCheckInterval until a value is
// received from stopCh or stopCh is closed. It blocks until then.
//
// A failed check is only logged as a warning; it does not end the loop. The
// interval is counted from the end of one check to the start of the next.
func (m *Monitor) Start(stopCh chan struct{}) {
	// One reusable timer instead of a fresh time.After per iteration, so the
	// pending timer is released as soon as monitoring stops.
	timer := time.NewTimer(healthCheckInterval)
	defer timer.Stop()

	for {
		select {
		case <-timer.C:
			logger.Debug("Checking osd processes status.")
			if err := m.osdStatus(); err != nil {
				logger.Warningf("Failed OSD status check: %+v", err)
			}
			// timer.C has just been drained, so Reset can be called directly.
			timer.Reset(healthCheckInterval)

		case <-stopCh:
			logger.Infof("Stopping monitoring of OSDs in namespace %s", m.clusterName)
			return
		}
	}
}
// osdStatus fetches the OSD dump of the cluster and updates the down-tracking
// state in m.lastStatus for every OSD listed in it.
//
// For each OSD:
//   - up: a tracking entry, if any, is removed;
//   - not up and untracked: the current time is recorded;
//   - not up and tracked: once the entry is older than osdGracePeriod a
//     warning is logged and the entry is reset to the current time, otherwise
//     a "waiting" warning is logged.
//
// An OSD whose id cannot be parsed is logged and skipped. An error is returned
// when the dump cannot be retrieved or when the status lookup for an OSD
// fails; in the latter case the remaining OSDs are not evaluated.
func (m *Monitor) osdStatus() error {
	logger.Debugf("OSDs with previously detected Down status: %+v", m.lastStatus)

	osdDump, err := client.GetOSDDump(m.context, m.clusterName)
	if err != nil {
		return err
	}
	logger.Debugf("osd dump %v", osdDump)

	for _, osd := range osdDump.OSDs {
		id64, err := osd.OSD.Int64()
		if err != nil {
			// Do not drop the entry silently: an unparsable id means this OSD
			// is not being monitored at all.
			logger.Warningf("skipping osd with invalid id: %+v", err)
			continue
		}
		id := int(id64)
		logger.Debugf("validating status of osd.%d", id)

		// Single lookup: the timestamp is needed below when the OSD is down.
		downSince, tracked := m.lastStatus[id]

		// No action on in/out cluster state is taken at this time.
		status, _, err := osdDump.StatusByID(int64(id))
		if err != nil {
			return err
		}

		if status == upStatus {
			logger.Debugf("osd.%d is healthy.", id)
			if tracked {
				logger.Debugf("osd.%d recovered, stopping tracking.", id)
				delete(m.lastStatus, id)
			}
			continue
		}

		logger.Infof("osd.%d is marked 'DOWN'", id)
		switch {
		case !tracked:
			// First time this OSD is seen down: start the grace period.
			m.lastStatus[id] = time.Now()
		case time.Since(downSince) > osdGracePeriod:
			logger.Warningf("osd.%d has been down for longer than the grace period (down since %+v)", id, downSince)
			// NOTE(review): resetting the timestamp restarts the grace period,
			// so later warnings report the time of this reset rather than the
			// original down time - confirm this is intended.
			m.lastStatus[id] = time.Now()
		default:
			logger.Warningf("waiting for the osd.%d to exceed the grace period", id)
		}
	}
	return nil
}