diff --git a/.travis.yml b/.travis.yml
index a4fa336..bf71152 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,8 @@ env:
   - DOCKER_TAG=$TRAVIS_TAG
 
 before_install:
+  - wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
+  - echo deb https://download.ceph.com/debian-luminous/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
   - sudo apt-get update
   - sudo apt-get install -y librados-dev librbd-dev
 
diff --git a/Dockerfile b/Dockerfile
index 182cea2..d0f501b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,9 +7,10 @@ ENV PATH $GOROOT/bin:$PATH
 ENV APPLOC $GOPATH/src/github.com/digitalocean/ceph_exporter
 
 RUN apt-get update && \
-    apt-get install -y apt-transport-https build-essential git curl
+    apt-get install -y apt-transport-https build-essential git curl wget
 
-RUN echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list
 
 RUN apt-get update && \
     apt-get install -y --force-yes librados-dev librbd-dev
@@ -28,8 +29,9 @@ FROM ubuntu:16.04
 MAINTAINER Vaibhav Bhembre
 
 RUN apt-get update && \
-    apt-get install -y apt-transport-https curl && \
-    echo "deb https://download.ceph.com/debian-jewel xenial main" >> /etc/apt/sources.list && \
+    apt-get install -y apt-transport-https curl wget
+RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
+RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
     apt-get update && \
     apt-get install -y --force-yes librados2 librbd1 && \
     rm -rf /var/lib/apt/lists/*
diff --git a/collectors/health.go b/collectors/health.go
index c9fb568..ee7922d 100644
--- a/collectors/health.go
+++ b/collectors/health.go
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"log"
 	"regexp"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -102,8 +103,13 @@ type ClusterHealthCollector struct {
 	DeepScrubbingPGs prometheus.Gauge
 
 	// SlowRequests depicts no. of total slow requests in the cluster
+	// This stat exists only for backwards compatibility.
 	SlowRequests prometheus.Gauge
 
+	// SlowRequestsByOSD depicts no. of total slow requests in the cluster
+	// labelled by OSD
+	SlowRequestsByOSD *prometheus.GaugeVec
+
 	// DegradedObjectsCount gives the no. of RADOS objects are constitute the degraded PGs.
 	// This includes object replicas in its count.
 	DegradedObjectsCount prometheus.Gauge
@@ -227,6 +233,15 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
 				ConstLabels: labels,
 			},
 		),
+		SlowRequestsByOSD: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   cephNamespace,
+				Name:        "slow_requests_osd",
+				Help:        "No. of slow requests by OSD",
+				ConstLabels: labels,
+			},
+			[]string{"osd"},
+		),
 		DegradedPGs: prometheus.NewGauge(
 			prometheus.GaugeOpts{
 				Namespace:   cephNamespace,
@@ -446,6 +461,12 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
 	}
 }
 
+func (c *ClusterHealthCollector) collectorList() []prometheus.Collector {
+	return []prometheus.Collector{
+		c.SlowRequestsByOSD,
+	}
+}
+
 func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
 	return []prometheus.Metric{
 		c.HealthStatus,
@@ -527,6 +548,18 @@ type cephHealthStats struct {
 	} `json:"pgmap"`
 }
 
+type cephHealthDetailStats struct {
+	Checks map[string]struct {
+		Details []struct {
+			Message string `json:"message"`
+		} `json:"detail"`
+		Summary struct {
+			Message string `json:"message"`
+		} `json:"summary"`
+		Severity string `json:"severity"`
+	} `json:"checks"`
+}
+
 func (c *ClusterHealthCollector) collect() error {
 	cmd := c.cephJSONUsage()
 	buf, _, err := c.conn.MonCommand(cmd)
@@ -769,6 +802,105 @@ func (c *ClusterHealthCollector) collect() error {
 	c.RemappedPGs.Set(stats.OSDMap.OSDMap.NumRemappedPGs)
 	c.TotalPGs.Set(stats.PGMap.NumPGs)
 
+	cmd = c.cephHealthDetailCommand()
+	buf, _, err = c.conn.MonCommand(cmd)
+	if err != nil {
+		return err
+	}
+
+	hdstats := &cephHealthDetailStats{}
+	if err := json.Unmarshal(buf, hdstats); err != nil {
+		return err
+	}
+
+	var (
+		slowOpsBlockedRegex         = regexp.MustCompile(`([\d]+) ops are blocked > ([\d\.]+) sec`)
+		slowRequestSingleOSDRegex   = regexp.MustCompile(`osd\.([\d]+) has blocked requests > ([\d\.]+) sec`)
+		slowRequestMultipleOSDRegex = regexp.MustCompile(`osds ([\d,]+) have blocked requests > ([\d\.]+) sec`)
+
+		secToOpsBlocked     = make(map[float64]int)
+		osdToSecondsBlocked = make(map[int]float64)
+	)
+
+	for key, check := range hdstats.Checks {
+		if key == "REQUEST_SLOW" {
+			for _, detail := range check.Details {
+				matched := slowOpsBlockedRegex.FindStringSubmatch(detail.Message)
+				if len(matched) == 3 {
+					v, err := strconv.Atoi(matched[1])
+					if err != nil {
+						return err
+					}
+
+					f, err := strconv.ParseFloat(matched[2], 64)
+					if err != nil {
+						return err
+					}
+
+					secToOpsBlocked[f] = v
+					continue
+				}
+
+				matched = slowRequestSingleOSDRegex.FindStringSubmatch(detail.Message)
+				if len(matched) == 3 {
+					v, err := strconv.Atoi(matched[1])
+					if err != nil {
+						return err
+					}
+
+					f, err := strconv.ParseFloat(matched[2], 64)
+					if err != nil {
+						return err
+					}
+
+					osdToSecondsBlocked[v] = f
+					continue
+				}
+
+				matched = slowRequestMultipleOSDRegex.FindStringSubmatch(detail.Message)
+				if len(matched) == 3 {
+					f, err := strconv.ParseFloat(matched[2], 64)
+					if err != nil {
+						return err
+					}
+
+					for _, osdID := range strings.Split(matched[1], ",") {
+						oid, err := strconv.Atoi(osdID)
+						if err != nil {
+							return err
+						}
+
+						osdToSecondsBlocked[oid] = f
+					}
+					continue
+				}
+			}
+		}
+	}
+
+	secs := make([]float64, 0, len(secToOpsBlocked))
+	for sec := range secToOpsBlocked {
+		secs = append(secs, sec)
+	}
+	sort.Float64s(secs)
+
+	totalOpsUntilNow := 0
+	totalOpsSet := false
+	for _, sec := range secs {
+		totalOpsUntilNow += secToOpsBlocked[sec]
+		for osd, osec := range osdToSecondsBlocked {
+			if sec == osec {
+				c.SlowRequestsByOSD.WithLabelValues(strconv.Itoa(osd)).Set(float64(totalOpsUntilNow))
+				totalOpsSet = true
+			}
+		}
+
+		if totalOpsSet {
+			totalOpsUntilNow = 0
+			totalOpsSet = false
+		}
+	}
+
 	return nil
 }
 
@@ -800,6 +932,20 @@ func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte {
 	return cmd
 }
 
+func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte {
+	cmd, err := json.Marshal(map[string]interface{}{
+		"prefix": "health",
+		"detail": "detail",
+		"format": jsonFormat,
+	})
+	if err != nil {
+		// Marshaling this hard-coded input should never fail,
+		// so treat any error as a programming bug.
+		panic(err)
+	}
+	return cmd
+}
+
 func (c *ClusterHealthCollector) collectRecoveryClientIO() error {
 	cmd := c.cephPlainUsage()
 	buf, _, err := c.conn.MonCommand(cmd)
@@ -1036,6 +1182,10 @@ func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
 // Describe sends all the descriptions of individual metrics of ClusterHealthCollector
 // to the provided prometheus channel.
 func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
+	for _, metric := range c.collectorList() {
+		metric.Describe(ch)
+	}
+
 	for _, metric := range c.metricsList() {
 		ch <- metric.Desc()
 	}
@@ -1052,6 +1202,10 @@ func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
 		log.Println("failed collecting cluster recovery/client io:", err)
 	}
 
+	for _, metric := range c.collectorList() {
+		metric.Collect(ch)
+	}
+
 	for _, metric := range c.metricsList() {
 		ch <- metric
 	}
diff --git a/collectors/health_test.go b/collectors/health_test.go
index 1fb5fb6..65c53e8 100644
--- a/collectors/health_test.go
+++ b/collectors/health_test.go
@@ -440,6 +440,88 @@ $ sudo ceph -s
 		},
 		{
 			input: `
+{
+    "checks": {
+        "REQUEST_SLOW": {
+            "severity": "HEALTH_WARN",
+            "summary": {
+                "message": "286 slow requests are blocked > 32 sec"
+            },
+            "detail": [
+                {
+                    "message": "102 ops are blocked > 524.288 sec"
+                },
+                {
+                    "message": "84 ops are blocked > 262.144 sec"
+                },
+                {
+                    "message": "53 ops are blocked > 131.072 sec"
+                },
+                {
+                    "message": "33 ops are blocked > 65.536 sec"
+                },
+                {
+                    "message": "14 ops are blocked > 32.768 sec"
+                },
+                {
+                    "message": "osds 363,463 have blocked requests > 32.768 sec"
+                },
+                {
+                    "message": "osd.349 has blocked requests > 524.288 sec"
+                }
+            ]
+        }
+    }
+}`,
+			regexes: []*regexp.Regexp{
+				regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 14`),
+				regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 14`),
+				regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 272`),
+			},
+		},
+		{
+			input: `
+{
+    "checks": {
+        "REQUEST_SLOW": {
+            "severity": "HEALTH_WARN",
+            "summary": {
+                "message": "286 slow requests are blocked > 32 sec"
+            },
+            "detail": [
+                {
+                    "message": "102 ops are blocked > 524.288 sec"
+                },
+                {
+                    "message": "84 ops are blocked > 262.144 sec"
+                },
+                {
+                    "message": "53 ops are blocked > 131.072 sec"
+                },
+                {
+                    "message": "33 ops are blocked > 65.536 sec"
+                },
+                {
+                    "message": "14 ops are blocked > 32.768 sec"
+                },
+                {
+                    "message": "osds 363,463 have blocked requests > 131.072 sec"
+                },
+                {
+                    "message": "osd.349 has blocked requests > 524.288 sec"
+                }
+            ]
+        }
+    }
+}`,
+			regexes: []*regexp.Regexp{
+				regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="363"} 100`),
+				regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="463"} 100`),
+				regexp.MustCompile(`slow_requests_osd{cluster="ceph",osd="349"} 186`),
+			},
+		},
+		{
+			input: `
 {
     "pgmap": {
         "write_op_per_sec": 500,