Implement configurable caching #46

Merged · 2 commits · Sep 29, 2020
4 changes: 4 additions & 0 deletions README.md
@@ -36,6 +36,10 @@ For a list of the available collectors, see [Collectors doc page](docs/collector

For flags and environment variables, see [Configuration doc page](docs/configuration.md).

## Caching

Optional caching can be enabled to prevent performance issues caused by this exporter; see the [Caching doc page](docs/caching.md).

## Running in Docker

The container image is available from [Docker Hub](https://hub.docker.com/) and [Quay.io](https://quay.io/):
96 changes: 85 additions & 11 deletions cmd/dellhw_exporter/dellhw_exporter.go
@@ -50,6 +50,9 @@ type CmdLineOpts struct {
enabledCollectors string
omReportExecutable string
cmdTimeout int64

cachingEnabled bool
cacheDuration int64
}

var (
@@ -62,6 +65,22 @@ var (
type DellHWCollector struct {
lastCollectTime time.Time
collectors map[string]collector.Collector

// Cache related
cachingEnabled bool
cacheDuration time.Duration
cache []prometheus.Metric
cacheMutex sync.Mutex
}

func NewDellHWCollector(collectors map[string]collector.Collector, cachingEnabled bool, cacheDurationSeconds int64) *DellHWCollector {
return &DellHWCollector{
cache: make([]prometheus.Metric, 0),
lastCollectTime: time.Unix(0, 0),
collectors: collectors,
cachingEnabled: cachingEnabled,
cacheDuration: time.Duration(cacheDurationSeconds) * time.Second,
}
}

func init() {
@@ -76,6 +95,9 @@ func init() {
flags.StringVar(&opts.metricsAddr, "web-listen-address", ":9137", "The address to listen on for HTTP requests")
flags.StringVar(&opts.metricsPath, "web-telemetry-path", "/metrics", "Path the metrics will be exposed under")

flags.BoolVar(&opts.cachingEnabled, "cache-enabled", false, "Enable metrics caching to reduce load")
flags.Int64Var(&opts.cacheDuration, "cache-duration", 20, "Cache duration in seconds")

flags.SetNormalizeFunc(normalizeFlags)
flags.SortFlags = true
}
@@ -128,22 +150,68 @@ func parseFlagsAndEnvVars() error {
}

// Describe implements the prometheus.Collector interface.
func (n DellHWCollector) Describe(ch chan<- *prometheus.Desc) {
func (n *DellHWCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- scrapeDurationDesc
ch <- scrapeSuccessDesc
}

// Collect implements the prometheus.Collector interface.
func (n DellHWCollector) Collect(ch chan<- prometheus.Metric) {
wg := sync.WaitGroup{}
wg.Add(len(n.collectors))
for name, c := range n.collectors {
go func(name string, c collector.Collector) {
execute(name, c, ch)
wg.Done()
}(name, c)
func (n *DellHWCollector) Collect(outgoingCh chan<- prometheus.Metric) {
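// When caching is enabled, serve metrics from the cache as long as it has not
// expired; otherwise clear the cache and fall through to a fresh collection.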
if n.cachingEnabled {
n.cacheMutex.Lock()
defer n.cacheMutex.Unlock()

expiry := n.lastCollectTime.Add(n.cacheDuration)
if time.Now().Before(expiry) {
log.Debugf("Using cache. Now: %s, Expiry: %s, LastCollect: %s", time.Now().String(), expiry.String(), n.lastCollectTime.String())
for _, cachedMetric := range n.cache {
log.Debugf("Pushing cached metric %s to outgoingCh", cachedMetric.Desc().String())
outgoingCh <- cachedMetric
}
return
}
// Clear cache, but keep slice
n.cache = n.cache[:0]
}
wg.Wait()

metricsCh := make(chan prometheus.Metric)

// Wait to ensure outgoingCh is not closed before the goroutine is finished
wgOutgoing := sync.WaitGroup{}
wgOutgoing.Add(1)
go func() {
for metric := range metricsCh {
outgoingCh <- metric
if n.cachingEnabled {
log.Debugf("Appending metric %s to cache", metric.Desc().String())
n.cache = append(n.cache, metric)
}
}
log.Debug("Finished pushing metrics from metricsCh to outgoingCh")
wgOutgoing.Done()
}()

wgCollection := sync.WaitGroup{}
wgCollection.Add(len(n.collectors))
for name, coll := range n.collectors {
go func(name string, coll collector.Collector) {
execute(name, coll, metricsCh)
wgCollection.Done()
}(name, coll)
}

log.Debug("Waiting for collectors")
wgCollection.Wait()
log.Debug("Finished waiting for collectors")

n.lastCollectTime = time.Now()
log.Debugf("Updated lastCollectTime to %s", n.lastCollectTime.String())

close(metricsCh)

log.Debug("Waiting for outgoing Adapter")
wgOutgoing.Wait()
log.Debug("Finished waiting for outgoing Adapter")
}

func execute(name string, c collector.Collector, ch chan<- prometheus.Metric) {
@@ -221,6 +289,12 @@ func main() {
log.Warnf("Not setting command timeout because it is zero")
}

if opts.cachingEnabled {
log.Infof("Caching enabled. Cache Duration: %ds", opts.cacheDuration)
} else {
log.Info("Caching is disabled by default")
}

omrOpts := &omreport.Options{
OMReportExecutable: opts.omReportExecutable,
}
@@ -236,7 +310,7 @@ func main() {
log.Infof(" - %s", n)
}

if err = prometheus.Register(DellHWCollector{lastCollectTime: time.Now(), collectors: collectors}); err != nil {
if err = prometheus.Register(NewDellHWCollector(collectors, opts.cachingEnabled, opts.cacheDuration)); err != nil {
log.Fatalf("Couldn't register collector: %s", err)
}
handler := promhttp.HandlerFor(prometheus.DefaultGatherer,
31 changes: 31 additions & 0 deletions docs/caching.md
@@ -0,0 +1,31 @@
The dellhw_exporter can be configured to cache collected metrics to prevent unnecessary executions of `omreport`, which can cause high load.

## Caching

Why do we even need caching in an exporter?

Unfortunately, Prometheus itself does not provide a 'real' high-availability concept. The Prometheus authors recommend running multiple identical Prometheus instances to achieve HA (see the [Prometheus FAQ](https://prometheus.io/docs/introduction/faq/#can-prometheus-be-made-highly-available)), which means the exporters are scraped multiple times within one scrape interval.

Besides the problem that the instances do not all retrieve identical metric values, this can also produce high load if the actual collection of metrics is 'expensive'. The first issue is not a real concern, because Prometheus does not claim to be consistent, but the second one is a real problem and a valid use case for caching.

This applies in particular to the `dellhw_exporter`, because the underlying `omreport` calls produce high load, caused by the many drivers that collect data from different components.

### Configuration

As you may have seen on [the Configuration doc page](configuration.md), there are two caching-related configuration parameters: one to enable caching and one to control how long the cache stays valid.

```console
--cache-enabled bool Enable caching (default false)
--cache-duration int Duration in seconds for the cache lifetime (default 20)
```

If you want to retrieve new metrics on each scrape, but want to prevent multiple collections caused by multiple Prometheus instances, it is a good idea to set the `cache-duration` equal to your job's `scrape_interval`. If your `scrape_interval` is even shorter than the default cache duration, it can be useful to set a different `cache-duration`, for example 2-3 times the `scrape_interval`.
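For example, assuming a Prometheus job that scrapes this exporter every 60 seconds, the exporter could be started as follows (the values are illustrative):

```console
dellhw_exporter --cache-enabled --cache-duration 60
```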


### Implementation details

An additional adapter channel is used to retrieve the collected metrics and, if caching is enabled, append them to a cache slice. A mutex is used to prevent concurrent collections and concurrent write operations to the cache.

Since the metrics are pushed into a "local" channel instead of directly into the channel passed in by the Prometheus library, a second waitgroup is needed. The first waitgroup ensures that all collectors have finished; the second ensures that all metrics have been written to the outgoing channel before the method returns. This is necessary because the Prometheus library closes the channel once the method returns.
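The following is a simplified, self-contained sketch of this pattern; metrics are stood in for by plain strings rather than the exporter's actual `prometheus.Metric` values, and the collector functions are made up for illustration:

```go
package main

import (
	"fmt"
	"sync"
)

// collectAll fans results from several collector functions into outgoingCh
// through a local adapter channel, using the two waitgroups described above.
func collectAll(collectors []func(chan<- string), outgoingCh chan<- string) {
	metricsCh := make(chan string)

	// Second waitgroup: keeps collectAll from returning before every value
	// read from metricsCh has been forwarded to outgoingCh.
	var wgOutgoing sync.WaitGroup
	wgOutgoing.Add(1)
	go func() {
		defer wgOutgoing.Done()
		for m := range metricsCh {
			outgoingCh <- m // a copy could also be appended to a cache here
		}
	}()

	// First waitgroup: waits until all collectors have finished writing.
	var wgCollection sync.WaitGroup
	wgCollection.Add(len(collectors))
	for _, c := range collectors {
		go func(c func(chan<- string)) {
			defer wgCollection.Done()
			c(metricsCh)
		}(c)
	}
	wgCollection.Wait()

	close(metricsCh) // ends the adapter goroutine's range loop
	wgOutgoing.Wait()
}

func main() {
	out := make(chan string)
	go func() {
		collectAll([]func(chan<- string){
			func(ch chan<- string) { ch <- "metric_a 1" },
			func(ch chan<- string) { ch <- "metric_b 2" },
		}, out)
		close(out)
	}()
	for m := range out {
		fmt.Println(m)
	}
}
```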

For further details please see the initial [Caching PR](https://github.com/galexrt/dellhw_exporter/pull/46).
4 changes: 4 additions & 0 deletions docs/configuration.md
@@ -14,6 +14,8 @@ Usage of dellhw_exporter:
--version Show version information
--web-listen-address string The address to listen on for HTTP requests (default ":9137")
--web-telemetry-path string Path the metrics will be exposed under (default "/metrics")
--cache-enabled bool Enable caching (default false)
--cache-duration int Duration in seconds for the cache lifetime (default 20)
```

## Environment Variables
@@ -30,6 +32,8 @@ DELLHW_EXPORTER_HELP
DELLHW_EXPORTER_VERSION
DELLHW_EXPORTER_WEB_LISTEN_ADDRESS
DELLHW_EXPORTER_WEB_TELEMETRY_PATH
DELLHW_EXPORTER_CACHE_ENABLED
DELLHW_EXPORTER_CACHE_DURATION
```
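The caching options can, for example, also be set through the corresponding environment variables (values are illustrative):

```console
DELLHW_EXPORTER_CACHE_ENABLED=true DELLHW_EXPORTER_CACHE_DURATION=60 dellhw_exporter
```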

### Docker specific Environment Variables