Skip to content

Commit

Permalink
collector/health: add stuck request metric
Browse files Browse the repository at this point in the history
  • Loading branch information
neurodrone committed May 23, 2019
1 parent 8cc817b commit 900c23c
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 13 deletions.
51 changes: 38 additions & 13 deletions collectors/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ type ClusterHealthCollector struct {
// This stat exists only for backwards compatibility.
SlowRequests prometheus.Gauge

// StuckRequests depicts no. of total requests in the cluster
// that haven't been served for over an hour.
StuckRequests prometheus.Gauge

// SlowRequestsByOSDDesc depicts no. of total slow requests in the cluster
// labelled by OSD.
SlowRequestsByOSDDesc *prometheus.Desc
Expand Down Expand Up @@ -343,6 +347,14 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
ConstLabels: labels,
},
),
StuckRequests: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_requests",
Help: "No. of stuck requests",
ConstLabels: labels,
},
),
SlowRequestsByOSDDesc: prometheus.NewDesc(
fmt.Sprintf("%s_slow_requests_osd", cephNamespace),
"No. of slow requests per OSD",
Expand Down Expand Up @@ -696,6 +708,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
c.ForcedBackfillPGs,
c.DownPGs,
c.SlowRequests,
c.StuckRequests,
c.DegradedObjectsCount,
c.MisplacedObjectsCount,
c.OSDMapFlagFull,
Expand Down Expand Up @@ -827,19 +840,20 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
}

var (
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
stuckDegradedRegex = regexp.MustCompile(`([\d]+) pgs stuck degraded`)
uncleanRegex = regexp.MustCompile(`([\d]+) pgs unclean`)
stuckUncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
undersizedRegex = regexp.MustCompile(`([\d]+) pgs undersized`)
stuckUndersizedRegex = regexp.MustCompile(`([\d]+) pgs stuck undersized`)
staleRegex = regexp.MustCompile(`([\d]+) pgs stale`)
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
slowRequestRegex = regexp.MustCompile(`([\d]+) requests are blocked`)
slowRequestRegexLuminous = regexp.MustCompile(`([\d]+) slow requests are blocked`)
degradedObjectsRegex = regexp.MustCompile(`([\d]+)/([\d]+) objects degraded`)
misplacedObjectsRegex = regexp.MustCompile(`([\d]+)/([\d]+) objects misplaced`)
osdmapFlagsRegex = regexp.MustCompile(`([^ ]+) flag\(s\) set`)
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
stuckDegradedRegex = regexp.MustCompile(`([\d]+) pgs stuck degraded`)
uncleanRegex = regexp.MustCompile(`([\d]+) pgs unclean`)
stuckUncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
undersizedRegex = regexp.MustCompile(`([\d]+) pgs undersized`)
stuckUndersizedRegex = regexp.MustCompile(`([\d]+) pgs stuck undersized`)
staleRegex = regexp.MustCompile(`([\d]+) pgs stale`)
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
slowRequestRegex = regexp.MustCompile(`([\d]+) requests are blocked`)
slowRequestRegexLuminous = regexp.MustCompile(`([\d]+) slow requests are blocked`)
stuckRequestRegexLuminous = regexp.MustCompile(`([\d]+) stuck requests are blocked`)
degradedObjectsRegex = regexp.MustCompile(`([\d]+)/([\d]+) objects degraded`)
misplacedObjectsRegex = regexp.MustCompile(`([\d]+)/([\d]+) objects misplaced`)
osdmapFlagsRegex = regexp.MustCompile(`([^ ]+) flag\(s\) set`)
)

for _, s := range stats.Health.Summary {
Expand Down Expand Up @@ -955,6 +969,17 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
}
}

// Handle the REQUEST_STUCK health check: pull the leading count out of the
// check's summary message and publish it on the StuckRequests gauge.
if k == "REQUEST_STUCK" {
matched := stuckRequestRegexLuminous.FindStringSubmatch(check.Summary.Message)
// Two entries means the full match plus the single numeric capture group;
// any other length means the message did not match and the gauge is left untouched.
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
// The capture group is digits-only, so a failure here is unexpected
// (e.g. a value too large for int); abort the collection with the error.
return err
}
c.StuckRequests.Set(float64(v))
}
}

if k == "PG_DEGRADED" {
matched := degradedObjectsRegex.FindStringSubmatch(check.Summary.Message)
if len(matched) == 3 {
Expand Down
18 changes: 18 additions & 0 deletions collectors/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,24 @@ $ sudo ceph -s
},
{
input: `
{
"health": {
"checks": {
"REQUEST_STUCK": {
"severity": "HEALTH_WARN",
"summary": {
"message": "125 stuck requests are blocked > 4194.3 sec"
}
}
}
}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`stuck_requests{cluster="ceph"} 125`),
},
},
{
input: `
{
"health": {
"checks": {
Expand Down

0 comments on commit 900c23c

Please sign in to comment.