health: capture slow request per osd
neurodrone committed May 10, 2018
1 parent afd5a2c commit 219fb69
Showing 4 changed files with 244 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
@@ -9,6 +9,8 @@ env:
- DOCKER_TAG=$TRAVIS_TAG

before_install:
- wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
- echo deb https://download.ceph.com/debian-luminous/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
- sudo apt-get update
- sudo apt-get install -y librados-dev librbd-dev

10 changes: 6 additions & 4 deletions Dockerfile
@@ -7,9 +7,10 @@ ENV PATH $GOROOT/bin:$PATH
ENV APPLOC $GOPATH/src/github.com/digitalocean/ceph_exporter

RUN apt-get update && \
apt-get install -y apt-transport-https build-essential git curl
apt-get install -y apt-transport-https build-essential git curl wget

RUN echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list
RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list

RUN apt-get update && \
apt-get install -y --force-yes librados-dev librbd-dev
@@ -28,8 +29,9 @@ FROM ubuntu:16.04
MAINTAINER Vaibhav Bhembre <vaibhav@digitalocean.com>

RUN apt-get update && \
apt-get install -y apt-transport-https curl && \
echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list && \
apt-get install -y apt-transport-https curl wget
RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
apt-get update && \
apt-get install -y --force-yes librados2 librbd1 && \
rm -rf /var/lib/apt/lists/*
154 changes: 154 additions & 0 deletions collectors/health.go
@@ -21,6 +21,7 @@ import (
"fmt"
"log"
"regexp"
"sort"
"strconv"
"strings"

@@ -102,8 +103,13 @@ type ClusterHealthCollector struct {
DeepScrubbingPGs prometheus.Gauge

// SlowRequests depicts the total number of slow requests in the cluster.
// This stat exists only for backwards compatibility.
SlowRequests prometheus.Gauge

// SlowRequestsByOSD depicts the total number of slow requests in the cluster,
// labelled by OSD.
SlowRequestsByOSD *prometheus.GaugeVec

// DegradedObjectsCount gives the number of RADOS objects that constitute the degraded PGs.
// This includes object replicas in its count.
DegradedObjectsCount prometheus.Gauge
@@ -227,6 +233,15 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
ConstLabels: labels,
},
),
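// A GaugeVec keyed on the "osd" label yields one time series per OSD; per
// the test fixtures below the exported series look like
// slow_requests_osd{cluster="ceph",osd="349"}, with the cephNamespace
// prefix prepended to the metric name by the client library.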
SlowRequestsByOSD: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "slow_requests_osd",
Help: "No. of slow requests",
ConstLabels: labels,
},
[]string{"osd"},
),
DegradedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@@ -446,6 +461,12 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
}
}

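// collectorList returns the vector metrics (currently only SlowRequestsByOSD).
// Unlike the plain gauges in metricsList, a GaugeVec is itself a
// prometheus.Collector, so Describe and Collect below delegate to its own
// Describe/Collect methods rather than sending a single metric on the channel.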
func (c *ClusterHealthCollector) collectorList() []prometheus.Collector {
return []prometheus.Collector{
c.SlowRequestsByOSD,
}
}

func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
return []prometheus.Metric{
c.HealthStatus,
@@ -527,6 +548,18 @@ type cephHealthStats struct {
} `json:"pgmap"`
}

type cephHealthDetailStats struct {
Checks map[string]struct {
Details []struct {
Message string `json:"message"`
} `json:"detail"`
Summary struct {
Message string `json:"message"`
} `json:"summary"`
Severity string `json:"severity"`
} `json:"checks"`
}
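// cephHealthDetailStats mirrors the "checks" section of the JSON health
// report returned by the mon health command in detail mode; see the
// REQUEST_SLOW fixtures in collectors/health_test.go for a sample payload.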

func (c *ClusterHealthCollector) collect() error {
cmd := c.cephJSONUsage()
buf, _, err := c.conn.MonCommand(cmd)
@@ -769,6 +802,105 @@ func (c *ClusterHealthCollector) collect() error {
c.RemappedPGs.Set(stats.OSDMap.OSDMap.NumRemappedPGs)
c.TotalPGs.Set(stats.PGMap.NumPGs)

cmd = c.cephHealthDetailCommand()
buf, _, err = c.conn.MonCommand(cmd)
if err != nil {
return err
}

hdstats := &cephHealthDetailStats{}
if err := json.Unmarshal(buf, hdstats); err != nil {
return err
}

var (
slowOpsBlockedRegex = regexp.MustCompile(`([\d]+) ops are blocked > ([\d\.]+) sec`)
slowRequestSingleOSDRegex = regexp.MustCompile(`osd.([\d]+) has blocked requests > ([\d\.]+) sec`)
slowRequestMultipleOSDRegex = regexp.MustCompile(`osds ([\d,]+) have blocked requests > ([\d\.]+) sec`)

secToOpsBlocked = make(map[float64]int)
osdToSecondsBlocked = make(map[int]float64)
)

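// The REQUEST_SLOW check reports three kinds of detail messages (examples
// taken from the test fixtures in this change):
//   "102 ops are blocked > 524.288 sec"                -> secToOpsBlocked
//   "osd.349 has blocked requests > 524.288 sec"       -> osdToSecondsBlocked
//   "osds 363,463 have blocked requests > 32.768 sec"  -> osdToSecondsBlocked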
for key, check := range hdstats.Checks {
if key == "REQUEST_SLOW" {
for _, detail := range check.Details {
matched := slowOpsBlockedRegex.FindStringSubmatch(detail.Message)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}

f, err := strconv.ParseFloat(matched[2], 64)
if err != nil {
return err
}

secToOpsBlocked[f] = v
continue
}

matched = slowRequestSingleOSDRegex.FindStringSubmatch(detail.Message)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}

f, err := strconv.ParseFloat(matched[2], 64)
if err != nil {
return err
}

osdToSecondsBlocked[v] = f
continue
}

matched = slowRequestMultipleOSDRegex.FindStringSubmatch(detail.Message)
if len(matched) == 3 {
f, err := strconv.ParseFloat(matched[2], 64)
if err != nil {
return err
}

for _, osdID := range strings.Split(matched[1], ",") {
oid, err := strconv.Atoi(osdID)
if err != nil {
return err
}

osdToSecondsBlocked[oid] = f
}
continue
}
}
}
}

secs := make([]float64, 0, len(secToOpsBlocked))
for sec := range secToOpsBlocked {
secs = append(secs, sec)
}
sort.Float64s(secs)

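// Attribute the blocked-op counts to OSDs: walk the thresholds in ascending
// order, accumulating op counts until a threshold that some OSD reported is
// reached; those OSDs are credited with the running total and the accumulator
// resets. For the first test fixture this yields 14 ops for osds 363 and 463
// (the 32.768s bucket alone) and 33+53+84+102 = 272 ops for osd.349 (every
// bucket above it).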
totalOpsUntilNow := 0
totalOpsSet := false
for _, sec := range secs {
totalOpsUntilNow += secToOpsBlocked[sec]
for osd, osec := range osdToSecondsBlocked {
if sec == osec {
c.SlowRequestsByOSD.WithLabelValues(strconv.Itoa(osd)).Set(float64(totalOpsUntilNow))
totalOpsSet = true
}
}

if totalOpsSet {
totalOpsUntilNow = 0
totalOpsSet = false
}
}

return nil
}

@@ -800,6 +932,20 @@ func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
return cmd
}

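// cephHealthDetailCommand builds the JSON-encoded mon command for the health
// report in detail mode; the payload carries prefix "health", detail "detail"
// and, assuming jsonFormat resolves to "json", format "json".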
func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "health",
"detail": "detail",
"format": jsonFormat,
})
if err != nil {
// panic! because ideally in no world should this hard-coded
// input fail to marshal.
panic(err)
}
return cmd
}

func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
cmd := c.cephPlainUsage()
buf, _, err := c.conn.MonCommand(cmd)
@@ -1036,6 +1182,10 @@ func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
// Describe sends all the descriptions of individual metrics of ClusterHealthCollector
// to the provided prometheus channel.
func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
for _, metric := range c.collectorList() {
metric.Describe(ch)
}

for _, metric := range c.metricsList() {
ch <- metric.Desc()
}
@@ -1052,6 +1202,10 @@ func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
log.Println("failed collecting cluster recovery/client io:", err)
}

for _, metric := range c.collectorList() {
metric.Collect(ch)
}

for _, metric := range c.metricsList() {
ch <- metric
}
82 changes: 82 additions & 0 deletions collectors/health_test.go
@@ -440,6 +440,88 @@ $ sudo ceph -s
},
{
input: `
{
"checks": {
"REQUEST_SLOW": {
"severity": "HEALTH_WARN",
"summary": {
"message": "286 slow requests are blocked > 32 sec"
},
"detail": [
{
"message": "102 ops are blocked > 524.288 sec"
},
{
"message": "84 ops are blocked > 262.144 sec"
},
{
"message": "53 ops are blocked > 131.072 sec"
},
{
"message": "33 ops are blocked > 65.536 sec"
},
{
"message": "14 ops are blocked > 32.768 sec"
},
{
"message": "osds 363,463 have blocked requests > 32.768 sec"
},
{
"message": "osd.349 has blocked requests > 524.288 sec"
}
]
}
}
}`,
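// osds 363 and 463 claim the lowest bucket (14 ops at 32.768s), while
// osd.349 is credited with the remaining 33+53+84+102 = 272 ops.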
regexes: []*regexp.Regexp{
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 14`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 14`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 272`),
},
},
{
input: `
{
"checks": {
"REQUEST_SLOW": {
"severity": "HEALTH_WARN",
"summary": {
"message": "286 slow requests are blocked > 32 sec"
},
"detail": [
{
"message": "102 ops are blocked > 524.288 sec"
},
{
"message": "84 ops are blocked > 262.144 sec"
},
{
"message": "53 ops are blocked > 131.072 sec"
},
{
"message": "33 ops are blocked > 65.536 sec"
},
{
"message": "14 ops are blocked > 32.768 sec"
},
{
"message": "osds 363,463 have blocked requests > 131.072 sec"
},
{
"message": "osd.349 has blocked requests > 524.288 sec"
}
]
}
}
}`,
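// here osds 363 and 463 sit at 131.072s and absorb 14+33+53 = 100 ops,
// leaving osd.349 with 84+102 = 186.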
regexes: []*regexp.Regexp{
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 100`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 100`),
regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 186`),
},
},
{
input: `
{
"pgmap": {
"write_op_per_sec": 500,
