Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup obsolete local files for alertmanager. #3910

Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## master / unreleased

* [CHANGE] Alertmanager now removes local files after Alertmanager is no longer running for a removed or resharded user. #3910
* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files previously stored by Alertmanager are migrated to the new hierarchy. Support for this migration will be removed in Cortex 1.10. #3910
* [ENHANCEMENT] Ruler: optimized `<prefix>/api/v1/rules` and `<prefix>/api/v1/alerts` when ruler sharding is enabled. #3916
* [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916
* `cortex_ruler_clients`
Expand Down
10 changes: 10 additions & 0 deletions development/tsdb-blocks-storage-s3/config/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,13 @@ groups:
rules:
- record: up:count
expr: count(up)

- name: example2
rules:
- alert: TooManyServices
expr: count(up) > 1
for: 1m
labels:
severity: page
annotations:
summary: Too many services
34 changes: 23 additions & 11 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,28 @@ import (
"github.com/prometheus/common/route"
)

const notificationLogMaintenancePeriod = 15 * time.Minute
const (
// maintenancePeriod is the interval passed to both the notification log
// (nflog.WithMaintenance) and the silences maintenance loop, which
// periodically store their state to a local snapshot file.
maintenancePeriod = 15 * time.Minute

// Names used within the tenant-specific data directory (Config.TenantDataDir).
// Each one is joined onto that directory with filepath.Join.
// notificationLogSnapshot is the notification log snapshot file name.
notificationLogSnapshot = "notifications"
// silencesSnapshot is the silences snapshot file name.
silencesSnapshot = "silences"
// templatesDir is the sub-directory that holds the tenant's template files.
templatesDir = "templates"
)

// Config configures an Alertmanager.
type Config struct {
UserID string
// Used to persist notification logs and silences on disk.
DataDir string
UserID string
Logger log.Logger
Peer *cluster.Peer
PeerTimeout time.Duration
Retention time.Duration
ExternalURL *url.URL

// Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, the entire directory is removed.
TenantDataDir string

ShardingEnabled bool
ReplicationFactor int
ReplicateStateFunc func(context.Context, string, *clusterpb.Part) error
Expand Down Expand Up @@ -115,6 +124,10 @@ type State interface {

// New creates a new Alertmanager.
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
if cfg.TenantDataDir == "" {
return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured")
}

am := &Alertmanager{
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
Expand Down Expand Up @@ -144,12 +157,11 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
}

am.wg.Add(1)
nflogID := fmt.Sprintf("nflog:%s", cfg.UserID)
var err error
am.nflog, err = nflog.New(
nflog.WithRetention(cfg.Retention),
nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogID)),
nflog.WithMaintenance(notificationLogMaintenancePeriod, am.stop, am.wg.Done),
nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)),
nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done),
nflog.WithMetrics(am.registry),
nflog.WithLogger(log.With(am.logger, "component", "nflog")),
)
Expand All @@ -162,9 +174,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {

am.marker = types.NewMarker(am.registry)

silencesID := fmt.Sprintf("silences:%s", cfg.UserID)
silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)
am.silences, err = silence.New(silence.Options{
SnapshotFile: filepath.Join(cfg.DataDir, silencesID),
SnapshotFile: silencesFile,
Retention: cfg.Retention,
Logger: log.With(am.logger, "component", "silences"),
Metrics: am.registry,
Expand All @@ -180,7 +192,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {

am.wg.Add(1)
go func() {
am.silences.Maintenance(15*time.Minute, filepath.Join(cfg.DataDir, silencesID), am.stop)
am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop)
am.wg.Done()
}()

Expand Down Expand Up @@ -240,7 +252,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s
templateFiles := make([]string, len(conf.Templates))
if len(conf.Templates) > 0 {
for i, t := range conf.Templates {
templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t)
templateFiles[i] = filepath.Join(am.cfg.TenantDataDir, templatesDir, t)
}
}

Expand Down
8 changes: 4 additions & 4 deletions pkg/alertmanager/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,14 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {
// not to the configured data dir, and on the flip side, it'll fail if we can't write
// to the temporary directory. Ignoring both cases for now as they're ultra rare but will revisit if
// we see this in the wild.
tmpDir, err := ioutil.TempDir("", "validate-config")
userTempDir, err := ioutil.TempDir("", "validate-config-"+cfg.User)
if err != nil {
return err
}
defer os.RemoveAll(tmpDir)
defer os.RemoveAll(userTempDir)

for _, tmpl := range cfg.Templates {
_, err := createTemplateFile(tmpDir, cfg.User, tmpl.Filename, tmpl.Body)
_, err := storeTemplateFile(userTempDir, tmpl.Filename, tmpl.Body)
if err != nil {
level.Error(logger).Log("msg", "unable to create template file", "err", err, "user", cfg.User)
return fmt.Errorf("unable to create template file '%s'", tmpl.Filename)
Expand All @@ -169,7 +169,7 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {

templateFiles := make([]string, len(amCfg.Templates))
for i, t := range amCfg.Templates {
templateFiles[i] = filepath.Join(tmpDir, "templates", cfg.User, t)
templateFiles[i] = filepath.Join(userTempDir, t)
}

_, err = template.FromGlobs(templateFiles...)
Expand Down
Loading