diff --git a/collectors/cluster_usage.go b/collectors/cluster_usage.go index 0860741..6a48a0d 100644 --- a/collectors/cluster_usage.go +++ b/collectors/cluster_usage.go @@ -16,42 +16,46 @@ package collectors import ( "encoding/json" - "log" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) const ( cephNamespace = "ceph" ) -// A ClusterUsageCollector is used to gather all the global stats about a given -// ceph cluster. It is sometimes essential to know how fast the cluster is growing -// or shrinking as a whole in order to zero in on the cause. The pool specific -// stats are provided separately. +// A ClusterUsageCollector is used to gather all the global stats about a +// given ceph cluster. It is sometimes essential to know how fast the cluster +// is growing or shrinking as a whole in order to zero in on the cause. The +// pool specific stats are provided separately. type ClusterUsageCollector struct { - conn Conn + conn Conn + logger *logrus.Logger // GlobalCapacity displays the total storage capacity of the cluster. This - // information is based on the actual no. of objects that are allocated. It - // does not take overcommitment into consideration. + // information is based on the actual no. of objects that are + // allocated. It does not take overcommitment into consideration. GlobalCapacity prometheus.Gauge // UsedCapacity shows the storage under use. UsedCapacity prometheus.Gauge - // AvailableCapacity shows the remaining capacity of the cluster that is left unallocated. + // AvailableCapacity shows the remaining capacity of the cluster that is + // left unallocated. AvailableCapacity prometheus.Gauge } -// NewClusterUsageCollector creates and returns the reference to ClusterUsageCollector -// and internally defines each metric that display cluster stats. -func NewClusterUsageCollector(conn Conn, cluster string) *ClusterUsageCollector { +// NewClusterUsageCollector creates and returns the reference to +// ClusterUsageCollector and internally defines each metric that display +// cluster stats. +func NewClusterUsageCollector(conn Conn, cluster string, logger *logrus.Logger) *ClusterUsageCollector { labels := make(prometheus.Labels) labels["cluster"] = cluster return &ClusterUsageCollector{ - conn: conn, + conn: conn, + logger: logger, GlobalCapacity: prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -94,6 +98,10 @@ func (c *ClusterUsageCollector) collect() error { cmd := c.cephUsageCommand() buf, _, err := c.conn.MonCommand(cmd) if err != nil { + c.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -106,17 +114,17 @@ func (c *ClusterUsageCollector) collect() error { totBytes, err = stats.Stats.TotalBytes.Float64() if err != nil { - log.Println("[ERROR] cannot extract total bytes:", err) + c.logger.WithError(err).Error("error extracting total bytes") } usedBytes, err = stats.Stats.TotalUsedBytes.Float64() if err != nil { - log.Println("[ERROR] cannot extract used bytes:", err) + c.logger.WithError(err).Error("error extracting used bytes") } availBytes, err = stats.Stats.TotalAvailBytes.Float64() if err != nil { - log.Println("[ERROR] cannot extract available bytes:", err) + c.logger.WithError(err).Error("error extracting available bytes") } c.GlobalCapacity.Set(totBytes) @@ -135,7 +143,7 @@ func (c *ClusterUsageCollector) cephUsageCommand() []byte { if err != nil { // panic! because ideally in no world this hard-coded input // should fail. 
- panic(err) + c.logger.WithError(err).Panic("error marshalling ceph df detail") } return cmd } @@ -151,8 +159,9 @@ func (c *ClusterUsageCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends the metric values for each metric pertaining to the global // cluster usage over to the provided prometheus Metric channel. func (c *ClusterUsageCollector) Collect(ch chan<- prometheus.Metric) { + c.logger.Debug("collecting cluster usage metrics") if err := c.collect(); err != nil { - log.Println("[ERROR] failed collecting cluster usage metrics:", err) + c.logger.WithError(err).Error("error collecting cluster usage metrics") return } diff --git a/collectors/cluster_usage_test.go b/collectors/cluster_usage_test.go index 548e965..7c34da8 100644 --- a/collectors/cluster_usage_test.go +++ b/collectors/cluster_usage_test.go @@ -16,7 +16,6 @@ package collectors import ( "io/ioutil" - "log" "net/http" "net/http/httptest" "regexp" @@ -24,11 +23,10 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) func TestClusterUsage(t *testing.T) { - log.SetOutput(ioutil.Discard) - for _, tt := range []struct { input string reMatch, reUnmatch []*regexp.Regexp @@ -128,7 +126,7 @@ func TestClusterUsage(t *testing.T) { }, } { func() { - collector := NewClusterUsageCollector(NewNoopConn(tt.input), "ceph") + collector := NewClusterUsageCollector(NewNoopConn(tt.input), "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } diff --git a/collectors/conn.go b/collectors/conn.go index eb16203..ee624e5 100644 --- a/collectors/conn.go +++ b/collectors/conn.go @@ -22,62 +22,140 @@ import ( "time" "github.com/ceph/go-ceph/rados" + "github.com/sirupsen/logrus" ) -// Conn interface implements only necessary methods that are used -// in this repository of *rados.Conn. This keeps rest of the implementation -// clean and *rados.Conn doesn't need to show up everywhere (it being -// more of an implementation detail in reality). Also it makes mocking -// easier for unit-testing the collectors. +// Conn interface implements only necessary methods that are used in this +// repository on top of *rados.Conn. This keeps rest of the implementation +// clean and *rados.Conn doesn't need to show up everywhere (it being more of +// an implementation detail in reality). Also it makes mocking easier for +// unit-testing the collectors. type Conn interface { - ReadDefaultConfigFile() error - Connect() error - Shutdown() MonCommand([]byte) ([]byte, string, error) - OpenIOContext(string) (*rados.IOContext, error) + GetPoolStats(string) (*rados.PoolStat, error) } -// Verify that *rados.Conn implements Conn correctly. -var _ Conn = &rados.Conn{} +// RadosConn implements the Conn interface with the underlying *rados.Conn +// that talks to a real Ceph cluster. +type RadosConn struct { + user string + configFile string + timeout time.Duration + logger *logrus.Logger +} -// CreateRadosConn creates an established rados connection to the Ceph cluster +// *RadosConn must implement the Conn. +var _ Conn = &RadosConn{} + +// NewRadosConn returns a new RadosConn. Unlike the native rados.Conn, there +// is no need to manage the connection before/after talking to the rados; it +// is the responsibility of this *RadosConn to manage the connection. 
+func NewRadosConn(user, configFile string, timeout time.Duration, logger *logrus.Logger) *RadosConn { + return &RadosConn{ + user: user, + configFile: configFile, + timeout: timeout, + logger: logger, + } +} + +// newRadosConn creates an established rados connection to the Ceph cluster // using the provided Ceph user and configFile. Ceph parameters // rados_osd_op_timeout and rados_mon_op_timeout are specified by the timeout // value, where 0 means no limit. -func CreateRadosConn(user, configFile string, timeout time.Duration) (*rados.Conn, error) { - conn, err := rados.NewConnWithUser(user) +func (c *RadosConn) newRadosConn() (*rados.Conn, error) { + conn, err := rados.NewConnWithUser(c.user) if err != nil { - return nil, fmt.Errorf("cannot create new ceph connection: %s", err) + return nil, fmt.Errorf("error creating rados connection: %s", err) } - err = conn.ReadConfigFile(configFile) + err = conn.ReadConfigFile(c.configFile) if err != nil { - return nil, fmt.Errorf("cannot read ceph config file: %s", err) + return nil, fmt.Errorf("error reading config file: %s", err) } - tv := strconv.FormatFloat(timeout.Seconds(), 'f', -1, 64) + tv := strconv.FormatFloat(c.timeout.Seconds(), 'f', -1, 64) // Set rados_osd_op_timeout and rados_mon_op_timeout to avoid Mon // and PG command hang. // See // https://github.com/ceph/ceph/blob/d4872ce97a2825afcb58876559cc73aaa1862c0f/src/common/legacy_config_opts.h#L1258-L1259 err = conn.SetConfigOption("rados_osd_op_timeout", tv) if err != nil { - return nil, fmt.Errorf("cannot set rados_osd_op_timeout for ceph cluster: %s", err) + return nil, fmt.Errorf("error setting rados_osd_op_timeout: %s", err) } err = conn.SetConfigOption("rados_mon_op_timeout", tv) if err != nil { - return nil, fmt.Errorf("cannot set rados_mon_op_timeout for ceph cluster: %s", err) + return nil, fmt.Errorf("error setting rados_mon_op_timeout: %s", err) } err = conn.Connect() if err != nil { - return nil, fmt.Errorf("cannot connect to ceph cluster: %s", err) + return nil, fmt.Errorf("error connecting to rados: %s", err) } return conn, nil } +// MonCommand executes a monitor command to rados. +func (c *RadosConn) MonCommand(args []byte) (buffer []byte, info string, err error) { + ll := c.logger.WithField("args", string(args)) + + ll.Trace("creating rados connection to execute mon command") + + conn, err := c.newRadosConn() + if err != nil { + return nil, "", err + } + defer conn.Shutdown() + + ll = ll.WithField("conn", conn.GetInstanceID()) + + ll.Trace("start executing mon command") + + buffer, info, err = conn.MonCommand(args) + + ll.WithError(err).Trace("complete executing mon command") + + return +} + +// GetPoolStats returns a *rados.PoolStat for the given rados pool. +func (c *RadosConn) GetPoolStats(pool string) (stat *rados.PoolStat, err error) { + ll := c.logger.WithField("pool", pool) + + ll.Trace("creating rados connection to get pool stats") + + conn, err := c.newRadosConn() + if err != nil { + return nil, err + } + defer conn.Shutdown() + + ll = ll.WithField("conn", conn.GetInstanceID()) + + ll.Trace("opening IOContext for pool") + + ioCtx, err := conn.OpenIOContext(pool) + if err != nil { + return nil, err + } + defer ioCtx.Destroy() + + ll.Trace("start getting pool stats") + + st, err := ioCtx.GetPoolStats() + if err != nil { + stat = nil + } else { + stat = &st + } + + ll.WithError(err).Trace("complete getting pool stats") + + return +} + // NoopConn is the stub we use for mocking rados Conn. Unit testing // each individual collectors becomes a lot easier after that. 
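(Editorial aside: the narrowed Conn interface means a test double now only needs to implement two methods, which is what makes the mocking described above easier. A minimal sketch under that assumption — the stubConn name is hypothetical and not part of this change:

package collectors

import (
	"errors"

	"github.com/ceph/go-ceph/rados"
)

// stubConn is a hypothetical, minimal test double for the narrowed Conn
// interface: only MonCommand and GetPoolStats need to be implemented.
type stubConn struct{ out []byte }

// MonCommand returns the canned output as if a monitor had produced it.
func (s *stubConn) MonCommand(_ []byte) ([]byte, string, error) {
	return s.out, "", nil
}

// GetPoolStats is unused by most collector tests, so it simply errors.
func (s *stubConn) GetPoolStats(_ string) (*rados.PoolStat, error) {
	return nil, errors.New("not implemented")
}

// Compile-time check that the stub satisfies Conn.
var _ Conn = &stubConn{}

End of aside.)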
// TODO: both output and cmdOut provide the command outputs for "go test", but @@ -114,23 +192,8 @@ func (n *NoopConn) IncIteration() { n.iteration++ } -// ReadDefaultConfigFile does not need to return an error. It satisfies -// rados.Conn's function with the same prototype. -func (n *NoopConn) ReadDefaultConfigFile() error { - return nil -} - -// Connect does not need to return an error. It satisfies -// rados.Conn's function with the same prototype. -func (n *NoopConn) Connect() error { - return nil -} - -// Shutdown satisfies rados.Conn's function prototype. -func (n *NoopConn) Shutdown() {} - -// MonCommand returns the provided output string to NoopConn as is, making -// it seem like it actually ran something and produced that string as a result. +// MonCommand returns the provided output string to NoopConn as is, making it +// seem like it actually ran something and produced that string as a result. func (n *NoopConn) MonCommand(args []byte) ([]byte, string, error) { // Unmarshal the input command and see if we need to intercept cmd := map[string]interface{}{} @@ -265,10 +328,7 @@ func (n *NoopConn) MonCommand(args []byte) ([]byte, string, error) { return []byte(n.output), "", nil } -// OpenIOContext always returns a nil rados.IOContext, and "not implemented" -// error. The OpenIOContext method in the rados package returns a pointer of -// rados.IOContext that contains an actual C.rados_ioctx_t, which is not -// available in this NoopConn. -func (n *NoopConn) OpenIOContext(pool string) (*rados.IOContext, error) { - return nil, errors.New("not implemented") +// GetPoolStats always returns a nil and "not implmemented" error. +func (n *NoopConn) GetPoolStats(pool string) (*rados.PoolStat, error) { + return nil, fmt.Errorf("not implemented") } diff --git a/collectors/health.go b/collectors/health.go index 4350f29..e755db0 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -19,12 +19,12 @@ import ( "bytes" "encoding/json" "fmt" - "log" "regexp" "strconv" "strings" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) var ( @@ -47,8 +47,8 @@ var ( // It surfaces changes in the ceph parameters unlike data usage that ClusterUsageCollector // does. type ClusterHealthCollector struct { - // conn holds connection to the Ceph cluster - conn Conn + conn Conn + logger *logrus.Logger // healthChecksMap stores warnings and their criticality healthChecksMap map[string]int @@ -258,12 +258,13 @@ const ( // NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health // metrics on. 
-func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollector { +func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger) *ClusterHealthCollector { labels := make(prometheus.Labels) labels["cluster"] = cluster return &ClusterHealthCollector{ - conn: conn, + conn: conn, + logger: logger, healthChecksMap: map[string]int{ "AUTH_BAD_CAPS": 2, @@ -983,9 +984,13 @@ type cephHealthDetailStats struct { } func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { - cmd := c.cephJSONUsage() + cmd := c.cephUsageCommand(jsonFormat) buf, _, err := c.conn.MonCommand(cmd) if err != nil { + c.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -1335,23 +1340,13 @@ const ( plainFormat format = "plain" ) -func (c *ClusterHealthCollector) cephPlainUsage() []byte { - return c.cephUsageCommand(plainFormat) -} - -func (c *ClusterHealthCollector) cephJSONUsage() []byte { - return c.cephUsageCommand(jsonFormat) -} - func (c *ClusterHealthCollector) cephUsageCommand(f format) []byte { cmd, err := json.Marshal(map[string]interface{}{ "prefix": "status", "format": f, }) if err != nil { - // panic! because ideally in no world this hard-coded input - // should fail. - panic(err) + c.logger.WithError(err).Panic("error marshalling ceph status") } return cmd } @@ -1363,17 +1358,19 @@ func (c *ClusterHealthCollector) cephHealthDetailCommand() []byte { "format": jsonFormat, }) if err != nil { - // panic! because ideally in no world this hard-coded input - // should fail. - panic(err) + c.logger.WithError(err).Panic("error marshalling ceph health detail") } return cmd } func (c *ClusterHealthCollector) collectRecoveryClientIO() error { - cmd := c.cephPlainUsage() + cmd := c.cephUsageCommand(plainFormat) buf, _, err := c.conn.MonCommand(cmd) if err != nil { + c.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -1620,12 +1617,14 @@ func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends all the collected metrics to the provided prometheus channel. // It requires the caller to handle synchronization. 
func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) { + c.logger.Debug("collecting cluster health metrics") if err := c.collect(ch); err != nil { - log.Println("failed collecting cluster health metrics:", err) + c.logger.WithError(err).Error("error collecting cluster health metrics") } + c.logger.Debug("collecting cluster recovery/client I/O metrics") if err := c.collectRecoveryClientIO(); err != nil { - log.Println("failed collecting cluster recovery/client io:", err) + c.logger.WithError(err).Error("error collecting cluster recovery/client I/O metrics") } for _, metric := range c.metricsList() { diff --git a/collectors/health_test.go b/collectors/health_test.go index 55ab25b..290b30a 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -23,6 +23,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) func TestClusterHealthCollector(t *testing.T) { @@ -743,7 +744,7 @@ $ sudo ceph -s }, } { func() { - collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph") + collector := NewClusterHealthCollector(NewNoopConn(tt.input), "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } diff --git a/collectors/monitors.go b/collectors/monitors.go index da478db..1a98d3c 100644 --- a/collectors/monitors.go +++ b/collectors/monitors.go @@ -16,9 +16,9 @@ package collectors import ( "encoding/json" - "log" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) // MonitorCollector is used to extract stats related to monitors @@ -26,7 +26,8 @@ import ( // to each monitor instance, there are various vector metrics we // need to use. type MonitorCollector struct { - conn Conn + conn Conn + logger *logrus.Logger // TotalKBs display the total storage a given monitor node has. TotalKBs *prometheus.GaugeVec @@ -77,12 +78,13 @@ type Store struct { // NewMonitorCollector creates an instance of the MonitorCollector and instantiates // the individual metrics that show information about the monitor processes. -func NewMonitorCollector(conn Conn, cluster string) *MonitorCollector { +func NewMonitorCollector(conn Conn, cluster string, logger *logrus.Logger) *MonitorCollector { labels := make(prometheus.Labels) labels["cluster"] = cluster return &MonitorCollector{ - conn: conn, + conn: conn, + logger: logger, TotalKBs: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -252,6 +254,10 @@ func (m *MonitorCollector) collect() error { cmd := m.cephUsageCommand() buf, _, err := m.conn.MonCommand(cmd) if err != nil { + m.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -263,6 +269,10 @@ func (m *MonitorCollector) collect() error { cmd = m.cephTimeSyncStatusCommand() buf, _, err = m.conn.MonCommand(cmd) if err != nil { + m.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -370,9 +380,7 @@ func (m *MonitorCollector) cephUsageCommand() []byte { "format": "json", }) if err != nil { - // panic! because ideally in no world this hard-coded input - // should fail. - panic(err) + m.logger.WithError(err).Panic("error marshalling ceph status") } return cmd } @@ -383,9 +391,7 @@ func (m *MonitorCollector) cephTimeSyncStatusCommand() []byte { "format": "json", }) if err != nil { - // panic! because ideally in no world this hard-coded input - // should fail. 
- panic(err) + m.logger.WithError(err).Panic("error marshalling ceph time-sync-status") } return cmd } @@ -405,8 +411,9 @@ func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) { // Collect extracts the given metrics from the Monitors and sends it to the prometheus // channel. func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric) { + m.logger.Debug("collecting ceph monitor metrics") if err := m.collect(); err != nil { - log.Println("failed collecting monitor metrics:", err) + m.logger.WithError(err).Error("error collecting ceph monitor metrics") return } diff --git a/collectors/monitors_test.go b/collectors/monitors_test.go index 51fe528..e2c889c 100644 --- a/collectors/monitors_test.go +++ b/collectors/monitors_test.go @@ -23,6 +23,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) func TestMonitorCollector(t *testing.T) { @@ -263,7 +264,7 @@ func TestMonitorCollector(t *testing.T) { }, } { func() { - collector := NewMonitorCollector(NewNoopConn(tt.input), "ceph") + collector := NewMonitorCollector(NewNoopConn(tt.input), "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } @@ -384,7 +385,7 @@ func TestMonitorTimeSyncStats(t *testing.T) { }, } { func() { - collector := NewMonitorCollector(NewNoopConn(tt.input), "ceph") + collector := NewMonitorCollector(NewNoopConn(tt.input), "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } diff --git a/collectors/osd.go b/collectors/osd.go index 5ca4360..502a134 100644 --- a/collectors/osd.go +++ b/collectors/osd.go @@ -4,7 +4,6 @@ import ( "bytes" "encoding/json" "fmt" - "log" "math" "regexp" "strconv" @@ -12,6 +11,7 @@ import ( "strings" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) const ( @@ -28,7 +28,8 @@ const ( // An important aspect of monitoring OSDs is to ensure that when the cluster is // up and running that all OSDs that are in the cluster are up and running, too type OSDCollector struct { - conn Conn + conn Conn + logger *logrus.Logger // osdScrubCache holds the cache of previous PG scrubs osdScrubCache map[int]int @@ -121,13 +122,14 @@ var _ prometheus.Collector = &OSDCollector{} // NewOSDCollector creates an instance of the OSDCollector and instantiates the // individual metrics that show information about the OSD. 
-func NewOSDCollector(conn Conn, cluster string) *OSDCollector { +func NewOSDCollector(conn Conn, cluster string, logger *logrus.Logger) *OSDCollector { labels := make(prometheus.Labels) labels["cluster"] = cluster osdLabels := []string{"osd", "device_class", "host", "rack", "root"} return &OSDCollector{ - conn: conn, + conn: conn, + logger: logger, osdScrubCache: make(map[int]int), osdLabelsCache: make(map[int64]*cephOSDLabel), @@ -542,10 +544,12 @@ func (c cephPGQuery) backfillTargets() map[int64]int64 { func (o *OSDCollector) collectOSDDF() error { cmd := o.cephOSDDFCommand() - buf, _, err := o.conn.MonCommand(cmd) if err != nil { - log.Printf("failed sending Mon command %s: %s", cmd, err) + o.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -661,7 +665,10 @@ func (o *OSDCollector) collectOSDPerf() error { cmd := o.cephOSDPerfCommand() buf, _, err := o.conn.MonCommand(cmd) if err != nil { - log.Printf("failed sending Mon command %s: %s", cmd, err) + o.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -761,13 +768,15 @@ func (o *OSDCollector) buildOSDLabelCache() error { cmd := o.cephOSDTreeCommand() data, _, err := o.conn.MonCommand(cmd) if err != nil { - log.Printf("failed sending Mon command %s: %s", cmd, err) + o.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } cache, err := buildOSDLabels(data) if err != nil { - log.Printf("failed to decode OSD lables: %s", err) return err } o.osdLabelsCache = cache @@ -795,7 +804,10 @@ func (o *OSDCollector) collectOSDTreeDown(ch chan<- prometheus.Metric) error { cmd := o.cephOSDTreeCommand("down") buff, _, err := o.conn.MonCommand(cmd) if err != nil { - log.Printf("failed sending Mon command %s: %s", cmd, err) + o.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -830,7 +842,10 @@ func (o *OSDCollector) collectOSDDump() error { cmd := o.cephOSDDump() buff, _, err := o.conn.MonCommand(cmd) if err != nil { - log.Printf("failed sending Mon command %s: %s", cmd, err) + o.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -884,7 +899,10 @@ func (o *OSDCollector) performPGDumpBrief() error { cmd := o.cephPGDumpCommand() buf, _, err := o.conn.MonCommand(cmd) if err != nil { - log.Printf("failed sending Mon command %s: %s", cmd, err) + o.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -941,7 +959,7 @@ func (o *OSDCollector) cephOSDDump() []byte { "format": jsonFormat, }) if err != nil { - panic(err) + o.logger.WithError(err).Panic("error marshalling ceph osd dump") } return cmd } @@ -952,7 +970,7 @@ func (o *OSDCollector) cephOSDDFCommand() []byte { "format": jsonFormat, }) if err != nil { - panic(err) + o.logger.WithError(err).Panic("error marshalling ceph osd df") } return cmd } @@ -963,7 +981,7 @@ func (o *OSDCollector) cephOSDPerfCommand() []byte { "format": jsonFormat, }) if err != nil { - panic(err) + o.logger.WithError(err).Panic("error marshalling ceph osd perf") } return cmd } @@ -979,7 +997,7 @@ func (o *OSDCollector) cephOSDTreeCommand(states ...string) []byte { cmd, err := json.Marshal(req) if err != nil { - panic(err) + o.logger.WithError(err).Panic("error marshalling ceph osd tree") } return cmd } @@ -991,7 +1009,7 @@ func (o *OSDCollector) 
cephPGDumpCommand() []byte { "format": jsonFormat, }) if err != nil { - panic(err) + o.logger.WithError(err).Panic("error marshalling ceph pg dump") } return cmd } @@ -1003,7 +1021,7 @@ func (o *OSDCollector) cephPGQueryCommand(pgid string) []byte { "format": jsonFormat, }) if err != nil { - panic(err) + o.logger.WithError(err).Panic("error marshalling ceph pg query") } return cmd } @@ -1022,7 +1040,6 @@ func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends all the collected metrics to the provided Prometheus channel. // It requires the caller to handle synchronization. func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) { - // Reset daemon specifc metrics; daemons can leave the cluster o.CrushWeight.Reset() o.Depth.Reset() @@ -1039,28 +1056,34 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) { o.OSDUp.Reset() o.buildOSDLabelCache() + o.logger.Debug("collecting OSD perf metrics") if err := o.collectOSDPerf(); err != nil { - log.Println("failed collecting OSD perf metrics:", err) + o.logger.WithError(err).Error("error collecting OSD perf metrics") } + o.logger.Debug("collecting OSD dump metrics") if err := o.collectOSDDump(); err != nil { - log.Println("failed collecting OSD dump metrics:", err) + o.logger.WithError(err).Error("error collecting OSD dump metrics") } + o.logger.Debug("collecting OSD df metrics") if err := o.collectOSDDF(); err != nil { - log.Println("failed collecting OSD df metrics:", err) + o.logger.WithError(err).Error("error collecting OSD df metrics") } + o.logger.Debug("collecting OSD tree down metrics") if err := o.collectOSDTreeDown(ch); err != nil { - log.Println("failed collecting OSD tree down metrics:", err) + o.logger.WithError(err).Error("error collecting OSD tree down metrics") } + o.logger.Debug("collecting PG dump metrics") if err := o.performPGDumpBrief(); err != nil { - log.Println("failed performing PG dump brief:", err) + o.logger.WithError(err).Error("error collecting PG dump metrics") } + o.logger.Debug("collecting OSD scrub metrics") if err := o.collectOSDScrubState(ch); err != nil { - log.Println("failed collecting OSD scrub metrics:", err) + o.logger.WithError(err).Error("error collecting OSD scrub metrics") } for _, metric := range o.collectorList() { diff --git a/collectors/osd_test.go b/collectors/osd_test.go index fb87358..686e76c 100644 --- a/collectors/osd_test.go +++ b/collectors/osd_test.go @@ -9,6 +9,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) const ( @@ -912,7 +913,7 @@ func TestOSDCollector(t *testing.T) { } { func() { conn := NewNoopConnWithCmdOut(tt.cmdOut) - collector := NewOSDCollector(conn, "ceph") + collector := NewOSDCollector(conn, "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } diff --git a/collectors/pool.go b/collectors/pool.go index e83364d..937ef04 100644 --- a/collectors/pool.go +++ b/collectors/pool.go @@ -17,11 +17,11 @@ package collectors import ( "encoding/json" "fmt" - "log" "math" "strconv" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) const ( @@ -32,7 +32,8 @@ const ( // PoolInfoCollector gives information about each pool that exists in a given // ceph cluster. type PoolInfoCollector struct { - conn Conn + conn Conn + logger *logrus.Logger // PGNum contains the count of PGs allotted to a particular pool. 
PGNum *prometheus.GaugeVec @@ -63,7 +64,7 @@ type PoolInfoCollector struct { } // NewPoolInfoCollector displays information about each pool in the cluster. -func NewPoolInfoCollector(conn Conn, cluster string) *PoolInfoCollector { +func NewPoolInfoCollector(conn Conn, cluster string, logger *logrus.Logger) *PoolInfoCollector { var ( subSystem = "pool" poolLabels = []string{"pool", "profile", "root"} @@ -73,7 +74,8 @@ func NewPoolInfoCollector(conn Conn, cluster string) *PoolInfoCollector { labels["cluster"] = cluster return &PoolInfoCollector{ - conn: conn, + conn: conn, + logger: logger, PGNum: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -193,6 +195,10 @@ func (p *PoolInfoCollector) collect() error { cmd := p.cephInfoCommand() buf, _, err := p.conn.MonCommand(cmd) if err != nil { + p.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -238,9 +244,7 @@ func (p *PoolInfoCollector) cephInfoCommand() []byte { "format": "json", }) if err != nil { - // panic! because ideally in no world this hard-coded input - // should fail. - panic(err) + p.logger.WithError(err).Panic("error marshalling ceph osd pool ls") } return cmd } @@ -256,8 +260,9 @@ func (p *PoolInfoCollector) Describe(ch chan<- *prometheus.Desc) { // Collect extracts the current values of all the metrics and sends them to the // prometheus channel. func (p *PoolInfoCollector) Collect(ch chan<- prometheus.Metric) { + p.logger.Debug("collecting pool metrics") if err := p.collect(); err != nil { - log.Println("[ERROR] failed collecting pool usage metrics:", err) + p.logger.WithError(err).Error("error collecting pool metrics") return } @@ -314,11 +319,15 @@ func (p *PoolInfoCollector) getCrushRuleToRootMappings() map[int64]string { "format": "json", }) if err != nil { - panic(err) + p.logger.WithError(err).Panic("error marshalling ceph osd crush rule dump") } buf, _, err := p.conn.MonCommand(cmd) if err != nil { + p.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return mappings } @@ -332,6 +341,8 @@ func (p *PoolInfoCollector) getCrushRuleToRootMappings() map[int64]string { err = json.Unmarshal(buf, &rules) if err != nil { + p.logger.WithError(err).Error("error unmarshalling crush rules") + return mappings } diff --git a/collectors/pool_test.go b/collectors/pool_test.go index c37a085..396d876 100644 --- a/collectors/pool_test.go +++ b/collectors/pool_test.go @@ -16,7 +16,6 @@ package collectors import ( "io/ioutil" - "log" "net/http" "net/http/httptest" "regexp" @@ -24,11 +23,10 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) func TestPoolInfoCollector(t *testing.T) { - log.SetOutput(ioutil.Discard) - for _, tt := range []struct { input string reMatch, reUnmatch []*regexp.Regexp @@ -62,7 +60,7 @@ func TestPoolInfoCollector(t *testing.T) { }, } { func() { - collector := NewPoolInfoCollector(NewNoopConn(tt.input), "ceph") + collector := NewPoolInfoCollector(NewNoopConn(tt.input), "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } diff --git a/collectors/pool_usage.go b/collectors/pool_usage.go index 1c1c417..1c6892a 100644 --- a/collectors/pool_usage.go +++ b/collectors/pool_usage.go @@ -16,15 +16,16 @@ package collectors import ( "encoding/json" - "log" "math" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) // 
PoolUsageCollector displays statistics about each pool in the Ceph cluster. type PoolUsageCollector struct { - conn Conn + conn Conn + logger *logrus.Logger // UsedBytes tracks the amount of bytes currently allocated for the pool. This // does not factor in the overcommitment made for individual images. @@ -67,7 +68,7 @@ type PoolUsageCollector struct { // NewPoolUsageCollector creates a new instance of PoolUsageCollector and returns // its reference. -func NewPoolUsageCollector(conn Conn, cluster string) *PoolUsageCollector { +func NewPoolUsageCollector(conn Conn, cluster string, logger *logrus.Logger) *PoolUsageCollector { var ( subSystem = "pool" poolLabel = []string{"pool"} @@ -77,7 +78,8 @@ func NewPoolUsageCollector(conn Conn, cluster string) *PoolUsageCollector { labels["cluster"] = cluster return &PoolUsageCollector{ - conn: conn, + conn: conn, + logger: logger, UsedBytes: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -232,6 +234,10 @@ func (p *PoolUsageCollector) collect() error { cmd := p.cephUsageCommand() buf, _, err := p.conn.MonCommand(cmd) if err != nil { + p.logger.WithError(err).WithField( + "args", string(cmd), + ).Error("error executing mon command") + return err } @@ -265,16 +271,12 @@ func (p *PoolUsageCollector) collect() error { p.WriteIO.WithLabelValues(pool.Name).Set(pool.Stats.WriteIO) p.WriteBytes.WithLabelValues(pool.Name).Set(pool.Stats.WriteBytes) - ioCtx, err := p.conn.OpenIOContext(pool.Name) + st, err := p.conn.GetPoolStats(pool.Name) if err != nil { - log.Println("[ERROR] failed to open IOContext:", err) - continue - } - defer ioCtx.Destroy() + p.logger.WithError(err).WithField( + "pool", pool.Name, + ).Error("error getting pool stats") - st, err := ioCtx.GetPoolStats() - if err != nil { - log.Println("[ERROR] failed to get pool stats:", err) continue } @@ -291,9 +293,7 @@ func (p *PoolUsageCollector) cephUsageCommand() []byte { "format": "json", }) if err != nil { - // panic! because ideally in no world this hard-coded input - // should fail. - panic(err) + p.logger.WithError(err).Panic("error marshalling ceph df detail") } return cmd } @@ -309,8 +309,9 @@ func (p *PoolUsageCollector) Describe(ch chan<- *prometheus.Desc) { // Collect extracts the current values of all the metrics and sends them to the // prometheus channel. 
func (p *PoolUsageCollector) Collect(ch chan<- prometheus.Metric) { + p.logger.Debug("collecting pool usage metrics") if err := p.collect(); err != nil { - log.Println("[ERROR] failed to collect pool usage metrics:", err) + p.logger.WithError(err).Error("error collecting pool usage metrics") return } diff --git a/collectors/pool_usage_test.go b/collectors/pool_usage_test.go index c9453e3..3840d3d 100644 --- a/collectors/pool_usage_test.go +++ b/collectors/pool_usage_test.go @@ -16,7 +16,6 @@ package collectors import ( "io/ioutil" - "log" "net/http" "net/http/httptest" "regexp" @@ -24,11 +23,10 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) func TestPoolUsageCollector(t *testing.T) { - log.SetOutput(ioutil.Discard) - for _, tt := range []struct { input string reMatch, reUnmatch []*regexp.Regexp @@ -176,7 +174,7 @@ func TestPoolUsageCollector(t *testing.T) { }, } { func() { - collector := NewPoolUsageCollector(NewNoopConn(tt.input), "ceph") + collector := NewPoolUsageCollector(NewNoopConn(tt.input), "ceph", logrus.New()) if err := prometheus.Register(collector); err != nil { t.Fatalf("collector failed to register: %s", err) } diff --git a/collectors/rgw.go b/collectors/rgw.go index 2be61d5..bb503a5 100644 --- a/collectors/rgw.go +++ b/collectors/rgw.go @@ -2,12 +2,12 @@ package collectors import ( "encoding/json" - "log" "os/exec" "strings" "time" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" ) const rgwGCTimeFormat = "2006-01-02 15:04:05" @@ -60,6 +60,7 @@ func rgwGetGCTaskList(config string) ([]byte, error) { type RGWCollector struct { config string background bool + logger *logrus.Logger // ActiveTasks reports the number of (expired) RGW GC tasks ActiveTasks *prometheus.GaugeVec @@ -76,12 +77,13 @@ type RGWCollector struct { // NewRGWCollector creates an instance of the RGWCollector and instantiates // the individual metrics that we can collect from the RGW service -func NewRGWCollector(cluster string, config string, background bool) *RGWCollector { +func NewRGWCollector(cluster string, config string, background bool, logger *logrus.Logger) *RGWCollector { labels := make(prometheus.Labels) labels["cluster"] = cluster rgw := &RGWCollector{ config: config, background: background, + logger: logger, getRGWGCTaskList: rgwGetGCTaskList, ActiveTasks: prometheus.NewGaugeVec( @@ -142,9 +144,10 @@ func (r *RGWCollector) collectorList() []prometheus.Collector { func (r *RGWCollector) backgroundCollect() error { for { + r.logger.WithField("background", r.background).Debug("collecting RGW GC stats") err := r.collect() if err != nil { - log.Println("Failed to collect RGW GC stats", err) + r.logger.WithField("background", r.background).WithError(err).Error("error collecting RGW GC stats") } time.Sleep(backgroundCollectInterval) } @@ -200,9 +203,10 @@ func (r *RGWCollector) Describe(ch chan<- *prometheus.Desc) { // It requires the caller to handle synchronization. 
func (r *RGWCollector) Collect(ch chan<- prometheus.Metric) { if !r.background { + r.logger.WithField("background", r.background).Debug("collecting RGW GC stats") err := r.collect() if err != nil { - log.Println("Failed to collect RGW GC stats", err) + r.logger.WithField("background", r.background).WithError(err).Error("error collecting RGW GC stats") } } diff --git a/collectors/rgw_test.go b/collectors/rgw_test.go index 85bcd7e..895bb03 100644 --- a/collectors/rgw_test.go +++ b/collectors/rgw_test.go @@ -10,6 +10,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" ) func TestRGWCollector(t *testing.T) { @@ -115,7 +116,7 @@ func TestRGWCollector(t *testing.T) { }, } { func() { - collector := NewRGWCollector("ceph", "", false) // run in foreground for testing + collector := NewRGWCollector("ceph", "", false, logrus.New()) // run in foreground for testing collector.getRGWGCTaskList = func(cluster string) ([]byte, error) { if tt.input != nil { return tt.input, nil diff --git a/exporter.go b/exporter.go index fcb891c..58176fd 100644 --- a/exporter.go +++ b/exporter.go @@ -23,12 +23,11 @@ import ( "syscall" "time" - "github.com/ceph/go-ceph/rados" "github.com/digitalocean/ceph_exporter/collectors" "github.com/ianschenck/envflag" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) const ( @@ -45,7 +44,7 @@ const keepAlive time.Duration = 3 * time.Minute type emfileAwareTcpListener struct { *net.TCPListener - logger *log.Logger + logger *logrus.Logger } func (ln emfileAwareTcpListener) Accept() (c net.Conn, err error) { @@ -57,7 +56,7 @@ func (ln emfileAwareTcpListener) Accept() (c net.Conn, err error) { } } // Default return - return nil, err + return } tc.SetKeepAlive(true) tc.SetKeepAlivePeriod(keepAlive) @@ -72,7 +71,7 @@ func (ln emfileAwareTcpListener) Accept() (c net.Conn, err error) { type CephExporter struct { mu sync.Mutex collectors []prometheus.Collector - logger *log.Logger + logger *logrus.Logger } // Verify that the exporter implements the interface correctly. @@ -81,15 +80,15 @@ var _ prometheus.Collector = &CephExporter{} // NewCephExporter creates an instance to CephExporter and returns a reference // to it. We can choose to enable a collector to extract stats out of by adding // it to the list of collectors. 
-func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int, logger *log.Logger) *CephExporter { +func NewCephExporter(conn collectors.Conn, cluster string, config string, rgwMode int, logger *logrus.Logger) *CephExporter { c := &CephExporter{ collectors: []prometheus.Collector{ - collectors.NewClusterUsageCollector(conn, cluster), - collectors.NewPoolUsageCollector(conn, cluster), - collectors.NewPoolInfoCollector(conn, cluster), - collectors.NewClusterHealthCollector(conn, cluster), - collectors.NewMonitorCollector(conn, cluster), - collectors.NewOSDCollector(conn, cluster), + collectors.NewClusterUsageCollector(conn, cluster, logger), + collectors.NewPoolUsageCollector(conn, cluster, logger), + collectors.NewPoolInfoCollector(conn, cluster, logger), + collectors.NewClusterHealthCollector(conn, cluster, logger), + collectors.NewMonitorCollector(conn, cluster, logger), + collectors.NewOSDCollector(conn, cluster, logger), }, logger: logger, } @@ -97,12 +96,12 @@ func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode in switch rgwMode { case collectors.RGWModeForeground: c.collectors = append(c.collectors, - collectors.NewRGWCollector(cluster, config, false), + collectors.NewRGWCollector(cluster, config, false, logger), ) case collectors.RGWModeBackground: c.collectors = append(c.collectors, - collectors.NewRGWCollector(cluster, config, true), + collectors.NewRGWCollector(cluster, config, true, logger), ) case collectors.RGWModeDisabled: @@ -152,9 +151,14 @@ func main() { envflag.Parse() - logger := log.New() + logger := logrus.New() + logger.SetFormatter(&logrus.TextFormatter{ + FullTimestamp: true, + }) - if v, err := log.ParseLevel(*logLevel); err != nil { + if v, err := logrus.ParseLevel(*logLevel); err != nil { + logger.WithError(err).Warn("error setting log level") + } else { logger.SetLevel(v) } @@ -179,21 +183,20 @@ func main() { } for _, cluster := range clusterConfigs { - conn, err := collectors.CreateRadosConn(cluster.User, cluster.ConfigFile, *cephRadosOpTimeout) - if err != nil { - logger.WithError(err).WithFields(log.Fields{ - "cephCluster": cluster.ClusterLabel, - "cephUser": cluster.User, - "cephConfig": cluster.ConfigFile, - }).Fatal("error creating rados connection") - } - - // defer Shutdown to program exit - defer conn.Shutdown() - - prometheus.MustRegister(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode, logger)) - - log.WithField("cephCluster", cluster.ClusterLabel).Info("exporting cluster") + conn := collectors.NewRadosConn( + cluster.User, + cluster.ConfigFile, + *cephRadosOpTimeout, + logger) + + prometheus.MustRegister(NewCephExporter( + conn, + cluster.ClusterLabel, + cluster.ConfigFile, + *rgwMode, + logger)) + + logger.WithField("cluster", cluster.ClusterLabel).Info("exporting cluster") } http.Handle(*metricsPath, promhttp.Handler()) @@ -207,17 +210,17 @@ func main() { `)) }) - logger.WithField("endpoint", *metricsAddr).Info("starting ceph_exporter") + logger.WithField("endpoint", *metricsAddr).Info("starting ceph_exporter listener") // Below is essentially http.ListenAndServe(), but using our custom // emfileAwareTcpListener that will die if we run out of file descriptors ln, err := net.Listen("tcp", *metricsAddr) if err != nil { - log.WithError(err).Fatal("error creating listener") + logrus.WithError(err).Fatal("error creating listener") } err = http.Serve(emfileAwareTcpListener{ln.(*net.TCPListener), logger}, nil) if err != nil { - log.WithError(err).Fatal("error serving requests") + 
log.WithError(err).Fatal("error serving requests")
+		logrus.WithError(err).Fatal("error serving requests")
 	}
 }
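(Editorial note: with the changes above, the per-cluster wiring in main reduces to roughly the following. This is a sketch assuming only the signatures introduced in this diff; the user name, config path, and timeout values are purely illustrative:

package main

import (
	"time"

	"github.com/digitalocean/ceph_exporter/collectors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
)

func main() {
	logger := logrus.New()
	logger.SetFormatter(&logrus.TextFormatter{FullTimestamp: true})

	// The connection is now lazy: NewRadosConn only records the parameters,
	// and each MonCommand/GetPoolStats call opens and shuts down its own
	// rados connection, so there is no Shutdown() to defer here.
	conn := collectors.NewRadosConn("client.admin", "/etc/ceph/ceph.conf", 30*time.Second, logger)

	prometheus.MustRegister(NewCephExporter(conn, "ceph", "/etc/ceph/ceph.conf", collectors.RGWModeDisabled, logger))
}

End of note.)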