// Package mdata stands for "managed data" or "metrics data" if you will.
// It has everything needed to keep metric data in memory, store it, and
// synchronize save states over the network.
package mdata

import (
"flag"
"io/ioutil"
"github.com/grafana/globalconf"
"github.com/grafana/metrictank/conf"
"github.com/grafana/metrictank/stats"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
log "github.com/sirupsen/logrus"
)

// Possible reason labels for the Prometheus metric discarded_samples_total
const (
	sampleOutOfOrder     = "sample-out-of-order"
	receivedTooLate      = "received-too-late"
	newValueForTimestamp = "new-value-for-timestamp"
)

var (
	// metric tank.chunk_operations.create is a counter of how many chunks are created
	chunkCreate = stats.NewCounter32("tank.chunk_operations.create")

	// metric tank.chunk_operations.clear is a counter of how many chunks are cleared (replaced by new chunks)
	chunkClear = stats.NewCounter32("tank.chunk_operations.clear")

	// metric tank.metrics_reordered is the number of points received that are going back in time, but are still
	// within the reorder window. in such a case they will be inserted in the correct order.
	// E.g. if the reorder window is 60 (datapoints), then points may be submitted in any order as long as their
	// ts is not older than the 60th datapoint counting from the newest.
	metricsReordered = stats.NewCounter32("tank.metrics_reordered")

	// metric tank.discarded.sample-out-of-order is points that go back in time beyond the scope of the optional reorder window.
	// these points will end up being dropped and lost.
	discardedSampleOutOfOrder = stats.NewCounterRate32("tank.discarded.sample-out-of-order")

	// metric tank.discarded.received-too-late is points received for the most recent chunk
	// when that chunk is already being "closed", i.e. the end-of-stream marker has been written to the chunk.
	// this indicates that your GC is actively sealing chunks and saving them before you have the chance to send
	// your (infrequent) updates. Any points received for a chunk that has already been closed are discarded.
	discardedReceivedTooLate = stats.NewCounterRate32("tank.discarded.received-too-late")

	// metric tank.discarded.new-value-for-timestamp is points that have timestamps for which we already have data points.
	// these points are discarded.
	// data points can be incorrectly classified as metric tank.discarded.sample-out-of-order even when the timestamp
	// has already been used. This happens in two cases:
	// - when the reorder buffer is enabled, if the point is older than the reorder buffer retention window
	// - when the reorder buffer is disabled, if the point is older than the last data point
	discardedNewValueForTimestamp = stats.NewCounterRate32("tank.discarded.new-value-for-timestamp")

	// metric tank.discarded.unknown is points that have been discarded for unknown reasons.
	discardedUnknown = stats.NewCounterRate32("tank.discarded.unknown")

	// metric tank.total_points is the number of points currently held in the in-memory ringbuffer
	totalPoints = stats.NewGauge64("tank.total_points")

	// metric mem.to_iter is how long it takes to transform in-memory chunks to iterators
	memToIterDuration = stats.NewLatencyHistogram15s32("mem.to_iter")

	// metric tank.persist is how long it takes to persist a chunk (and chunks preceding it)
	// this is subject to backpressure from the store when the store's queue runs full
	persistDuration = stats.NewLatencyHistogram15s32("tank.persist")

	// metric tank.metrics_active is the number of currently known metrics (excl rollup series), measured every second
	metricsActive = stats.NewGauge32("tank.metrics_active")

	// metric tank.gc_metric is the number of times the metrics GC is about to inspect a metric (series)
	gcMetric = stats.NewCounter32("tank.gc_metric")

	// metric recovered_errors.aggmetric.getaggregated.bad-consolidator is how many times we detected a GetAggregated call
	// with an incorrect consolidator specified
	badConsolidator = stats.NewCounter32("recovered_errors.aggmetric.getaggregated.bad-consolidator")

	// metric recovered_errors.aggmetric.getaggregated.bad-aggspan is how many times we detected a GetAggregated call
	// with an incorrect aggspan specified
	badAggSpan = stats.NewCounter32("recovered_errors.aggmetric.getaggregated.bad-aggspan")

	// set either via ConfigProcess or from the unit tests. other code should not touch these.
	Aggregations conf.Aggregations
	Schemas      conf.Schemas
	schemasFile  = "/etc/metrictank/storage-schemas.conf"
	aggFile      = "/etc/metrictank/storage-aggregation.conf"

	promActiveMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metrictank",
		Name:      "metrics_active",
		Help:      "Current # of active metrics",
	}, []string{"org"})

	PromDiscardedSamples = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: "metrictank",
		Name:      "discarded_samples_total",
		Help:      "Total # of samples that were discarded",
	}, []string{"reason", "org"})
)
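
// Illustrative sketch (not part of the original file): elsewhere in mdata, a
// discarded point would typically bump both the internal counter and the
// Prometheus vector with the matching reason label. orgID and the call site
// here are hypothetical.
//
//	discardedSampleOutOfOrder.Inc()
//	PromDiscardedSamples.WithLabelValues(sampleOutOfOrder, strconv.Itoa(orgID)).Inc()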

func ConfigSetup() {
	retentionConf := flag.NewFlagSet("retention", flag.ExitOnError)
	retentionConf.StringVar(&schemasFile, "schemas-file", "/etc/metrictank/storage-schemas.conf", "path to storage-schemas.conf file")
	retentionConf.StringVar(&aggFile, "aggregations-file", "/etc/metrictank/storage-aggregation.conf", "path to storage-aggregation.conf file")
	globalconf.Register("retention", retentionConf, flag.ExitOnError)
}
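
// A minimal sketch of the intended startup order, assuming the globalconf
// flow used by metrictank binaries; cfg and its construction are
// hypothetical and not defined in this file:
//
//	mdata.ConfigSetup()   // register the "retention" flag set
//	cfg.ParseAll()        // parse flags and the config file via globalconf
//	mdata.ConfigProcess() // load schemas and aggregations, or log.Fatal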

func ConfigProcess() {
	var err error

	// === read storage-schemas.conf ===
	// graphite behavior: abort on any config reading errors, but skip any rules that have problems.
	// at the end, add a default schema of 7 days of minutely data.
	// we are stricter and don't tolerate any errors; that seems in the user's best interest.
	Schemas, err = conf.ReadSchemas(schemasFile)
	if err != nil {
		log.Fatalf("can't read schemas file %q: %s", schemasFile, err.Error())
	}

	// === read storage-aggregation.conf ===
	// graphite behavior:
	// continue if the file can't be read (the file is optional), but quit on any other error reading the config.
	// always add a default rule with xFilesFactor None and aggregationMethod None
	// (which get interpreted by whisper as 0.5 and avg) at the end.
	// since we can't distinguish read errors from parse errors, we first try a plain read separately.
	_, err = ioutil.ReadFile(aggFile)
	if err == nil {
		Aggregations, err = conf.ReadAggregations(aggFile)
		if err != nil {
			log.Fatalf("can't read storage-aggregation file %q: %s", aggFile, err.Error())
		}
	} else {
		log.Infof("Could not read %s: %s: using defaults", aggFile, err)
		Aggregations = conf.NewAggregations()
	}
}
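
// Sketch of how the loaded config might be consumed downstream. The Match
// signature is an assumption about the conf package's API, not verified
// against this exact revision:
//
//	schemaID, schema := Schemas.Match("some.metric.name", 10)
//	log.Infof("matched schema %d with retentions %v", schemaID, schema.Retentions)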