From aabf7c6b6e5cd0bfed9ade5980bcf61ee99963f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 4 Mar 2021 15:47:16 +0100 Subject: [PATCH 01/14] Cleanup obsolete local files for alertmanager. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../tsdb-blocks-storage-s3/config/rules.yaml | 10 +++ pkg/alertmanager/alertmanager.go | 13 +++- pkg/alertmanager/multitenant.go | 73 ++++++++++++++++++- pkg/alertmanager/multitenant_test.go | 49 +++++++++++++ 4 files changed, 137 insertions(+), 8 deletions(-) diff --git a/development/tsdb-blocks-storage-s3/config/rules.yaml b/development/tsdb-blocks-storage-s3/config/rules.yaml index acffa8ada9..b65a4590da 100644 --- a/development/tsdb-blocks-storage-s3/config/rules.yaml +++ b/development/tsdb-blocks-storage-s3/config/rules.yaml @@ -4,3 +4,13 @@ groups: rules: - record: up:count expr: count(up) + + - name: example2 + rules: + - alert: TooManyServices + expr: count(up) > 1 + for: 1m + labels: + severity: page + annotations: + summary: Too many services diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index b0eaf0b81a..8b828d97a9 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -43,7 +43,13 @@ import ( "github.com/prometheus/common/route" ) -const notificationLogMaintenancePeriod = 15 * time.Minute +const ( + notificationLogMaintenancePeriod = 15 * time.Minute + + // Per-tenant files have these prefixes. + nflogPrefix = "nflog:" + silencesPrefix = "silences:" +) // Config configures an Alertmanager. type Config struct { @@ -144,11 +150,10 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { } am.wg.Add(1) - nflogID := fmt.Sprintf("nflog:%s", cfg.UserID) var err error am.nflog, err = nflog.New( nflog.WithRetention(cfg.Retention), - nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogID)), + nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogPrefix+cfg.UserID)), nflog.WithMaintenance(notificationLogMaintenancePeriod, am.stop, am.wg.Done), nflog.WithMetrics(am.registry), nflog.WithLogger(log.With(am.logger, "component", "nflog")), @@ -162,7 +167,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.marker = types.NewMarker(am.registry) - silencesID := fmt.Sprintf("silences:%s", cfg.UserID) + silencesID := silencesPrefix + cfg.UserID am.silences, err = silence.New(silence.Options{ SnapshotFile: filepath.Join(cfg.DataDir, silencesID), Retention: cfg.Retention, diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 23f7ef2dcc..41d9968d05 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -10,6 +10,7 @@ import ( "net/url" "os" "path/filepath" + "strings" "sync" "time" @@ -551,6 +552,8 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR } am.syncConfigs(cfgs) + am.deleteObsoleteLocalFiles() + return nil } @@ -636,20 +639,27 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alertspb.AlertCon am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() } + userAlertmanagersToStop := map[string]*Alertmanager{} + am.alertmanagersMtx.Lock() - defer am.alertmanagersMtx.Unlock() for userID, userAM := range am.alertmanagers { if _, exists := cfgs[userID]; !exists { - level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", userID) - userAM.Stop() + userAlertmanagersToStop[userID] = userAM delete(am.alertmanagers, userID) delete(am.cfgs, userID) am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(userID) am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) am.alertmanagerMetrics.removeUserRegistry(userID) - level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", userID) } } + am.alertmanagersMtx.Unlock() + + // Now stop alertmanagers and wait until they are really stopped, without holding lock. + for userID, userAM := range userAlertmanagersToStop { + level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", userID) + userAM.StopAndWait() + level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", userID) + } } // setConfig applies the given configuration to the alertmanager for `userID`, @@ -944,6 +954,61 @@ func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *cluste return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil } +// deleteObsoleteLocalFiles finds local files that we no longer need, and deletes them. +func (am *MultitenantAlertmanager) deleteObsoleteLocalFiles() { + userFilesToDelete := am.getSnapshotFilesPerUser() + + am.alertmanagersMtx.Lock() + for userID := range am.alertmanagers { + delete(userFilesToDelete, userID) // Don't delete files for users we're managing. + } + am.alertmanagersMtx.Unlock() + + // And delete remaining files. + for userID, files := range userFilesToDelete { + for _, fn := range files { + if err := os.Remove(fn); err != nil { + level.Warn(am.logger).Log("msg", "failed to delete local state file for user", "file", fn, "user", userID, "err", err) + } else { + level.Info(am.logger).Log("msg", "deleted local state file for user", "file", fn, "user", userID) + } + } + } +} + +func (am *MultitenantAlertmanager) getSnapshotFilesPerUser() map[string][]string { + files, err := ioutil.ReadDir(am.cfg.DataDir) + if err != nil { + level.Warn(am.logger).Log("msg", "failed to list file in data dir", "dir", am.cfg.DataDir, "err", err) + return nil + } + + result := map[string][]string{} + + for _, f := range files { + if f.IsDir() { + continue + } + + fullPath := filepath.Join(am.cfg.DataDir, f.Name()) + + switch { + case strings.HasPrefix(f.Name(), nflogPrefix): + userID := strings.TrimPrefix(f.Name(), nflogPrefix) + result[userID] = append(result[userID], fullPath) + + case strings.HasPrefix(f.Name(), silencesPrefix): + userID := strings.TrimPrefix(f.Name(), silencesPrefix) + result[userID] = append(result[userID], fullPath) + + default: + level.Warn(am.logger).Log("msg", "ignoring unknown local data file", "file", fullPath) + } + } + + return result +} + // StatusHandler shows the status of the alertmanager. type StatusHandler struct { am *MultitenantAlertmanager diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index d8b46a4dd2..9c2fcd3217 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -12,6 +12,7 @@ import ( "net/http/httptest" "net/http/pprof" "os" + "path/filepath" "regexp" "strings" "testing" @@ -194,6 +195,54 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) } +func TestMultitenantAlertmanager_deleteObsoleteLocalFiles(t *testing.T) { + ctx := context.Background() + + const ( + user1 = "user1" + user2 = "user2" + ) + + store := prepareInMemoryAlertStore() + require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user2, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + + reg := prometheus.NewPedanticRegistry() + cfg := mockAlertmanagerConfig(t) + am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, log.NewNopLogger(), reg) + require.NoError(t, err) + + testUser1File1 := createFile(t, cfg.DataDir, silencesPrefix+user1) + testUser1File2 := createFile(t, cfg.DataDir, nflogPrefix+user1) + testUser2File := createFile(t, cfg.DataDir, nflogPrefix+user2) + + files := am.getSnapshotFilesPerUser() + require.Equal(t, 2, len(files)) + require.ElementsMatch(t, []string{testUser1File1, testUser1File2}, files[user1]) + require.ElementsMatch(t, []string{testUser2File}, files[user2]) + + // Ensure the configs are synced correctly + err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) + require.NoError(t, err) + + // loadAndSyncConfigs also cleans up obsolete files. Let's verify that. + files = am.getSnapshotFilesPerUser() + + require.Nil(t, files[user1]) // has no configuration, files were deleted + require.NotNil(t, files[user2]) // has config, files survived +} + +func createFile(t *testing.T, dir string, name string) string { + path := filepath.Join(dir, name) + f, err := os.Create(path) + require.NoError(t, err) + require.NoError(t, f.Close()) + return path +} + func TestMultitenantAlertmanager_NoExternalURL(t *testing.T) { amConfig := mockAlertmanagerConfig(t) amConfig.ExternalURL = flagext.URLValue{} // no external URL From 8eb1cf177d066cf649b77662884ebbaa14a19c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 4 Mar 2021 15:49:56 +0100 Subject: [PATCH 02/14] CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60e7b15ea1..962be275a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916 * `cortex_ruler_clients` * `cortex_ruler_client_request_duration_seconds` +* [ENHANCEMENT] Alertmanager: cleanup obsolete local files after Alertmanager is no longer running for removed or resharded user. #3910 ## 1.8.0 in progress From a39b3ad5335810e32e52424e72494d47f0eadf13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 4 Mar 2021 16:01:35 +0100 Subject: [PATCH 03/14] Comment. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 8b828d97a9..b63ae2783c 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -46,7 +46,7 @@ import ( const ( notificationLogMaintenancePeriod = 15 * time.Minute - // Per-tenant files have these prefixes. + // Per-tenant local files have these prefixes. nflogPrefix = "nflog:" silencesPrefix = "silences:" ) @@ -167,9 +167,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.marker = types.NewMarker(am.registry) - silencesID := silencesPrefix + cfg.UserID + silencesFile := filepath.Join(cfg.DataDir, silencesPrefix+cfg.UserID) am.silences, err = silence.New(silence.Options{ - SnapshotFile: filepath.Join(cfg.DataDir, silencesID), + SnapshotFile: silencesFile, Retention: cfg.Retention, Logger: log.With(am.logger, "component", "silences"), Metrics: am.registry, @@ -185,7 +185,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.wg.Add(1) go func() { - am.silences.Maintenance(15*time.Minute, filepath.Join(cfg.DataDir, silencesID), am.stop) + am.silences.Maintenance(notificationLogMaintenancePeriod, silencesFile, am.stop) am.wg.Done() }() From 27d19cb32d02185300a2bf9dac85f1c7613f4462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 4 Mar 2021 16:03:05 +0100 Subject: [PATCH 04/14] Don't ignore directories. Log error when deletion fails instead. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/multitenant.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 41d9968d05..fd8674ac3f 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -986,10 +986,6 @@ func (am *MultitenantAlertmanager) getSnapshotFilesPerUser() map[string][]string result := map[string][]string{} for _, f := range files { - if f.IsDir() { - continue - } - fullPath := filepath.Join(am.cfg.DataDir, f.Name()) switch { From 2d85dc6bbce99b95501c0110ec6344ef684673e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 5 Mar 2021 17:48:09 +0100 Subject: [PATCH 05/14] Address review feedback. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- pkg/alertmanager/alertmanager.go | 13 +++++++------ pkg/alertmanager/multitenant.go | 4 ++-- pkg/alertmanager/multitenant_test.go | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 962be275a9..36f3300777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,11 @@ ## master / unreleased +* [CHANGE] Alertmanager: clean obsolete local files after Alertmanager is no longer running for removed or resharded user. #3910 * [ENHANCEMENT] Ruler: optimized `/api/v1/rules` and `/api/v1/alerts` when ruler sharding is enabled. #3916 * [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916 * `cortex_ruler_clients` * `cortex_ruler_client_request_duration_seconds` -* [ENHANCEMENT] Alertmanager: cleanup obsolete local files after Alertmanager is no longer running for removed or resharded user. #3910 ## 1.8.0 in progress diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index b63ae2783c..f6d6b23514 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -44,11 +44,12 @@ import ( ) const ( - notificationLogMaintenancePeriod = 15 * time.Minute + // MaintenancePeriod is used for periodic storing of silences and notifications to local file. + maintenancePeriod = 15 * time.Minute // Per-tenant local files have these prefixes. - nflogPrefix = "nflog:" - silencesPrefix = "silences:" + notificationLogPrefix = "nflog:" + silencesPrefix = "silences:" ) // Config configures an Alertmanager. @@ -153,8 +154,8 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { var err error am.nflog, err = nflog.New( nflog.WithRetention(cfg.Retention), - nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogPrefix+cfg.UserID)), - nflog.WithMaintenance(notificationLogMaintenancePeriod, am.stop, am.wg.Done), + nflog.WithSnapshot(filepath.Join(cfg.DataDir, notificationLogPrefix+cfg.UserID)), + nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done), nflog.WithMetrics(am.registry), nflog.WithLogger(log.With(am.logger, "component", "nflog")), ) @@ -185,7 +186,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.wg.Add(1) go func() { - am.silences.Maintenance(notificationLogMaintenancePeriod, silencesFile, am.stop) + am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop) am.wg.Done() }() diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index fd8674ac3f..586417ade2 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -989,8 +989,8 @@ func (am *MultitenantAlertmanager) getSnapshotFilesPerUser() map[string][]string fullPath := filepath.Join(am.cfg.DataDir, f.Name()) switch { - case strings.HasPrefix(f.Name(), nflogPrefix): - userID := strings.TrimPrefix(f.Name(), nflogPrefix) + case strings.HasPrefix(f.Name(), notificationLogPrefix): + userID := strings.TrimPrefix(f.Name(), notificationLogPrefix) result[userID] = append(result[userID], fullPath) case strings.HasPrefix(f.Name(), silencesPrefix): diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 9c2fcd3217..e18d2be8d7 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -216,8 +216,8 @@ func TestMultitenantAlertmanager_deleteObsoleteLocalFiles(t *testing.T) { require.NoError(t, err) testUser1File1 := createFile(t, cfg.DataDir, silencesPrefix+user1) - testUser1File2 := createFile(t, cfg.DataDir, nflogPrefix+user1) - testUser2File := createFile(t, cfg.DataDir, nflogPrefix+user2) + testUser1File2 := createFile(t, cfg.DataDir, notificationLogPrefix+user1) + testUser2File := createFile(t, cfg.DataDir, notificationLogPrefix+user2) files := am.getSnapshotFilesPerUser() require.Equal(t, 2, len(files)) From 0f5c25a9f362aca6ed7a16a56eeb6cb8779680d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 12:20:47 +0100 Subject: [PATCH 06/14] Move per-tenant state into tenant directory to simplify cleanup. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 23 +++-- pkg/alertmanager/multitenant.go | 159 ++++++++++++++++++++++++++----- 2 files changed, 149 insertions(+), 33 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index f6d6b23514..d1f5e6ec87 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -47,22 +47,23 @@ const ( // MaintenancePeriod is used for periodic storing of silences and notifications to local file. maintenancePeriod = 15 * time.Minute - // Per-tenant local files have these prefixes. - notificationLogPrefix = "nflog:" - silencesPrefix = "silences:" + notificationLogSnapshot = "notifications" + silencesSnapshot = "silences" + templatesDir = "templates" ) // Config configures an Alertmanager. type Config struct { - UserID string - // Used to persist notification logs and silences on disk. - DataDir string + UserID string Logger log.Logger Peer *cluster.Peer PeerTimeout time.Duration Retention time.Duration ExternalURL *url.URL + // Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed. + TenantDataDir string + ShardingEnabled bool ReplicationFactor int ReplicateStateFunc func(context.Context, string, *clusterpb.Part) error @@ -122,6 +123,10 @@ type State interface { // New creates a new Alertmanager. func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { + if cfg.TenantDataDir == "" { + return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured") + } + am := &Alertmanager{ cfg: cfg, logger: log.With(cfg.Logger, "user", cfg.UserID), @@ -154,7 +159,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { var err error am.nflog, err = nflog.New( nflog.WithRetention(cfg.Retention), - nflog.WithSnapshot(filepath.Join(cfg.DataDir, notificationLogPrefix+cfg.UserID)), + nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)), nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done), nflog.WithMetrics(am.registry), nflog.WithLogger(log.With(am.logger, "component", "nflog")), @@ -168,7 +173,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.marker = types.NewMarker(am.registry) - silencesFile := filepath.Join(cfg.DataDir, silencesPrefix+cfg.UserID) + silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot) am.silences, err = silence.New(silence.Options{ SnapshotFile: silencesFile, Retention: cfg.Retention, @@ -246,7 +251,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s templateFiles := make([]string, len(conf.Templates)) if len(conf.Templates) > 0 { for i, t := range conf.Templates { - templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t) + templateFiles[i] = filepath.Join(am.cfg.TenantDataDir, templatesDir, t) } } diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 586417ade2..6ebbf0c54d 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -448,6 +448,46 @@ func (h *handlerForGRPCServer) ServeHTTP(w http.ResponseWriter, req *http.Reques } func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { + // Migrate any existing configuration from old format to new one. + { + st, err := am.getObsoleteFilesPerUser() + if err != nil { + return errors.Wrap(err, "failed to migrate existing files") + } + + for userID, files := range st { + tenantDir := filepath.Join(am.cfg.DataDir, userID) + err := os.MkdirAll(tenantDir, 0777) + if err != nil { + return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) + } + + if files.notificationLogSnapshot != "" { + target := filepath.Join(tenantDir, notificationLogSnapshot) + err := os.Rename(files.notificationLogSnapshot, target) + if err != nil { + return errors.Wrapf(err, "failed to move notification snapshot from %v to %v", files.notificationLogSnapshot, target) + } + } + + if files.silencesSnapshot != "" { + target := filepath.Join(tenantDir, silencesSnapshot) + err := os.Rename(files.silencesSnapshot, target) + if err != nil { + return errors.Wrapf(err, "failed to move silences snapshot from %v to %v", files.silencesSnapshot, target) + } + } + + if files.templatesDir != "" { + target := filepath.Join(tenantDir, templatesDir) + err := os.Rename(files.templatesDir, target) + if err != nil { + return errors.Wrapf(err, "failed to move templates directory from %v to %v", files.templatesDir, target) + } + } + } + } + defer func() { if err == nil || am.subservices == nil { return @@ -552,7 +592,7 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR } am.syncConfigs(cfgs) - am.deleteObsoleteLocalFiles() + am.deleteUnusedLocalUserState() return nil } @@ -755,9 +795,15 @@ func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config, rawCfg string) (*Alertmanager, error) { reg := prometheus.NewRegistry() + tenantDir := filepath.Join(am.cfg.DataDir, userID) + err := os.MkdirAll(tenantDir, 0777) + if err != nil { + return nil, errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) + } + newAM, err := New(&Config{ UserID: userID, - DataDir: am.cfg.DataDir, + TenantDataDir: tenantDir, Logger: util_log.Logger, Peer: am.peer, PeerTimeout: am.cfg.Cluster.PeerTimeout, @@ -954,55 +1000,120 @@ func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *cluste return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil } -// deleteObsoleteLocalFiles finds local files that we no longer need, and deletes them. -func (am *MultitenantAlertmanager) deleteObsoleteLocalFiles() { - userFilesToDelete := am.getSnapshotFilesPerUser() - - am.alertmanagersMtx.Lock() - for userID := range am.alertmanagers { - delete(userFilesToDelete, userID) // Don't delete files for users we're managing. - } - am.alertmanagersMtx.Unlock() +// deleteUnusedLocalUserState finds local files that we no longer need, and deletes them. +func (am *MultitenantAlertmanager) deleteUnusedLocalUserState() { + userDirs := am.getPerUserDirectories() // And delete remaining files. - for userID, files := range userFilesToDelete { - for _, fn := range files { - if err := os.Remove(fn); err != nil { - level.Warn(am.logger).Log("msg", "failed to delete local state file for user", "file", fn, "user", userID, "err", err) - } else { - level.Info(am.logger).Log("msg", "deleted local state file for user", "file", fn, "user", userID) - } + for userID, dir := range userDirs { + am.alertmanagersMtx.Lock() + _, exists := am.alertmanagers[userID] + am.alertmanagersMtx.Unlock() + + // Don't delete directory if AM for user still exists. + if exists { + continue + } + + err := os.RemoveAll(dir) + if err != nil { + level.Warn(am.logger).Log("msg", "failed to delete directory for user", "dir", dir, "user", userID, "err", err) + } else { + level.Info(am.logger).Log("msg", "deleted local directory for user", "dir", dir, "user", userID) } } } -func (am *MultitenantAlertmanager) getSnapshotFilesPerUser() map[string][]string { +func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string { files, err := ioutil.ReadDir(am.cfg.DataDir) if err != nil { - level.Warn(am.logger).Log("msg", "failed to list file in data dir", "dir", am.cfg.DataDir, "err", err) + level.Warn(am.logger).Log("msg", "failed to list local dir", "dir", am.cfg.DataDir, "err", err) return nil } - result := map[string][]string{} + result := map[string]string{} for _, f := range files { fullPath := filepath.Join(am.cfg.DataDir, f.Name()) + if !f.IsDir() { + level.Warn(am.logger).Log("msg", "ignoring local file", "file", fullPath) + continue + } + + result[f.Name()] = fullPath + } + return result +} + +type obsoleteStateFiles struct { + notificationLogSnapshot string + silencesSnapshot string + templatesDir string +} + +// getObsoleteFilesPerUser returns per-user set of files that should be migrated from old structure to new structure. +func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsoleteStateFiles, error) { + files, err := ioutil.ReadDir(am.cfg.DataDir) + if err != nil { + return nil, errors.Wrapf(err, "failed to list dir %v", am.cfg.DataDir) + } + + const ( + notificationLogPrefix = "nflog:" + silencesPrefix = "silences:" + templates = "templates" + ) + + result := map[string]obsoleteStateFiles{} + + for _, f := range files { + fullPath := filepath.Join(am.cfg.DataDir, f.Name()) + + if f.IsDir() { + // Process templates dir. + if f.Name() != templates { + // Ignore other files -- those are likely per tenant directories. + continue + } + + templateDirs, err := ioutil.ReadDir(fullPath) + if err != nil { + return nil, errors.Wrapf(err, "failed to list dir %v", fullPath) + } + + // Previously templates directory contained per-tenant subdirectory. + for _, d := range templateDirs { + if d.IsDir() { + v := result[d.Name()] + v.templatesDir = filepath.Join(fullPath, d.Name()) + result[d.Name()] = v + } else { + level.Warn(am.logger).Log("msg", "ignoring unknown local file", "file", filepath.Join(fullPath, d.Name())) + } + } + continue + } + switch { case strings.HasPrefix(f.Name(), notificationLogPrefix): userID := strings.TrimPrefix(f.Name(), notificationLogPrefix) - result[userID] = append(result[userID], fullPath) + v := result[userID] + v.notificationLogSnapshot = fullPath + result[userID] = v case strings.HasPrefix(f.Name(), silencesPrefix): userID := strings.TrimPrefix(f.Name(), silencesPrefix) - result[userID] = append(result[userID], fullPath) + v := result[userID] + v.silencesSnapshot = fullPath + result[userID] = v default: level.Warn(am.logger).Log("msg", "ignoring unknown local data file", "file", fullPath) } } - return result + return result, nil } // StatusHandler shows the status of the alertmanager. From 88fb03c80db4bb5604d44f7056460ea1f76a3319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 17:47:58 +0100 Subject: [PATCH 07/14] Move migration to separate function. Add test for migration. Fix test for deletion of unused dirs. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/multitenant.go | 230 ++++++++++++++------------- pkg/alertmanager/multitenant_test.go | 82 ++++++++-- 2 files changed, 188 insertions(+), 124 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 6ebbf0c54d..5166e897b9 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -22,6 +22,7 @@ import ( amconfig "github.com/prometheus/alertmanager/config" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" "github.com/weaveworks/common/httpgrpc" "github.com/weaveworks/common/httpgrpc/server" "github.com/weaveworks/common/user" @@ -448,44 +449,9 @@ func (h *handlerForGRPCServer) ServeHTTP(w http.ResponseWriter, req *http.Reques } func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { - // Migrate any existing configuration from old format to new one. - { - st, err := am.getObsoleteFilesPerUser() - if err != nil { - return errors.Wrap(err, "failed to migrate existing files") - } - - for userID, files := range st { - tenantDir := filepath.Join(am.cfg.DataDir, userID) - err := os.MkdirAll(tenantDir, 0777) - if err != nil { - return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) - } - - if files.notificationLogSnapshot != "" { - target := filepath.Join(tenantDir, notificationLogSnapshot) - err := os.Rename(files.notificationLogSnapshot, target) - if err != nil { - return errors.Wrapf(err, "failed to move notification snapshot from %v to %v", files.notificationLogSnapshot, target) - } - } - - if files.silencesSnapshot != "" { - target := filepath.Join(tenantDir, silencesSnapshot) - err := os.Rename(files.silencesSnapshot, target) - if err != nil { - return errors.Wrapf(err, "failed to move silences snapshot from %v to %v", files.silencesSnapshot, target) - } - } - - if files.templatesDir != "" { - target := filepath.Join(tenantDir, templatesDir) - err := os.Rename(files.templatesDir, target) - if err != nil { - return errors.Wrapf(err, "failed to move templates directory from %v to %v", files.templatesDir, target) - } - } - } + err = am.migrateStateFilesToPerTenantDirectories() + if err != nil { + return err } defer func() { @@ -541,6 +507,118 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { return nil } +// migrateStateFilesToPerTenantDirectories migrate any existing configuration from old place to new hierarchy. +func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() error { + migrate := func(from, to string) error { + level.Info(am.logger).Log("msg", "migrating AM state", "from", from, "to", to) + err := os.Rename(from, to) + return errors.Wrapf(err, "failed to migrate from %v to %v", from, to) + } + + st, err := am.getObsoleteFilesPerUser() + if err != nil { + return errors.Wrap(err, "failed to migrate existing files") + } + + for userID, files := range st { + tenantDir := filepath.Join(am.cfg.DataDir, userID) + err := os.MkdirAll(tenantDir, 0777) + if err != nil { + return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) + } + + errs := tsdb_errors.NewMulti() + + if files.notificationLogSnapshot != "" { + errs.Add(migrate(files.notificationLogSnapshot, filepath.Join(tenantDir, notificationLogSnapshot))) + } + + if files.silencesSnapshot != "" { + errs.Add(migrate(files.silencesSnapshot, filepath.Join(tenantDir, silencesSnapshot))) + } + + if files.templatesDir != "" { + errs.Add(migrate(files.templatesDir, filepath.Join(tenantDir, templatesDir))) + } + + if err := errs.Err(); err != nil { + return err + } + } + return nil +} + +type obsoleteStateFiles struct { + notificationLogSnapshot string + silencesSnapshot string + templatesDir string +} + +// getObsoleteFilesPerUser returns per-user set of files that should be migrated from old structure to new structure. +func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsoleteStateFiles, error) { + files, err := ioutil.ReadDir(am.cfg.DataDir) + if err != nil { + return nil, errors.Wrapf(err, "failed to list dir %v", am.cfg.DataDir) + } + + // old names + const ( + notificationLogPrefix = "nflog:" + silencesPrefix = "silences:" + templates = "templates" + ) + + result := map[string]obsoleteStateFiles{} + + for _, f := range files { + fullPath := filepath.Join(am.cfg.DataDir, f.Name()) + + if f.IsDir() { + // Process templates dir. + if f.Name() != templates { + // Ignore other files -- those are likely per tenant directories. + continue + } + + templateDirs, err := ioutil.ReadDir(fullPath) + if err != nil { + return nil, errors.Wrapf(err, "failed to list dir %v", fullPath) + } + + // Previously templates directory contained per-tenant subdirectory. + for _, d := range templateDirs { + if d.IsDir() { + v := result[d.Name()] + v.templatesDir = filepath.Join(fullPath, d.Name()) + result[d.Name()] = v + } else { + level.Warn(am.logger).Log("msg", "ignoring unknown local file", "file", filepath.Join(fullPath, d.Name())) + } + } + continue + } + + switch { + case strings.HasPrefix(f.Name(), notificationLogPrefix): + userID := strings.TrimPrefix(f.Name(), notificationLogPrefix) + v := result[userID] + v.notificationLogSnapshot = fullPath + result[userID] = v + + case strings.HasPrefix(f.Name(), silencesPrefix): + userID := strings.TrimPrefix(f.Name(), silencesPrefix) + v := result[userID] + v.silencesSnapshot = fullPath + result[userID] = v + + default: + level.Warn(am.logger).Log("msg", "ignoring unknown local data file", "file", fullPath) + } + } + + return result, nil +} + func (am *MultitenantAlertmanager) run(ctx context.Context) error { tick := time.NewTicker(am.cfg.PollInterval) defer tick.Stop() @@ -1000,18 +1078,18 @@ func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *cluste return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil } -// deleteUnusedLocalUserState finds local files that we no longer need, and deletes them. +// deleteUnusedLocalUserState deletes local files for users that we no longer need. func (am *MultitenantAlertmanager) deleteUnusedLocalUserState() { userDirs := am.getPerUserDirectories() // And delete remaining files. for userID, dir := range userDirs { am.alertmanagersMtx.Lock() - _, exists := am.alertmanagers[userID] + userAM := am.alertmanagers[userID] am.alertmanagersMtx.Unlock() // Don't delete directory if AM for user still exists. - if exists { + if userAM != nil { continue } @@ -1046,76 +1124,6 @@ func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string { return result } -type obsoleteStateFiles struct { - notificationLogSnapshot string - silencesSnapshot string - templatesDir string -} - -// getObsoleteFilesPerUser returns per-user set of files that should be migrated from old structure to new structure. -func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsoleteStateFiles, error) { - files, err := ioutil.ReadDir(am.cfg.DataDir) - if err != nil { - return nil, errors.Wrapf(err, "failed to list dir %v", am.cfg.DataDir) - } - - const ( - notificationLogPrefix = "nflog:" - silencesPrefix = "silences:" - templates = "templates" - ) - - result := map[string]obsoleteStateFiles{} - - for _, f := range files { - fullPath := filepath.Join(am.cfg.DataDir, f.Name()) - - if f.IsDir() { - // Process templates dir. - if f.Name() != templates { - // Ignore other files -- those are likely per tenant directories. - continue - } - - templateDirs, err := ioutil.ReadDir(fullPath) - if err != nil { - return nil, errors.Wrapf(err, "failed to list dir %v", fullPath) - } - - // Previously templates directory contained per-tenant subdirectory. - for _, d := range templateDirs { - if d.IsDir() { - v := result[d.Name()] - v.templatesDir = filepath.Join(fullPath, d.Name()) - result[d.Name()] = v - } else { - level.Warn(am.logger).Log("msg", "ignoring unknown local file", "file", filepath.Join(fullPath, d.Name())) - } - } - continue - } - - switch { - case strings.HasPrefix(f.Name(), notificationLogPrefix): - userID := strings.TrimPrefix(f.Name(), notificationLogPrefix) - v := result[userID] - v.notificationLogSnapshot = fullPath - result[userID] = v - - case strings.HasPrefix(f.Name(), silencesPrefix): - userID := strings.TrimPrefix(f.Name(), silencesPrefix) - v := result[userID] - v.silencesSnapshot = fullPath - result[userID] = v - - default: - level.Warn(am.logger).Log("msg", "ignoring unknown local data file", "file", fullPath) - } - } - - return result, nil -} - // StatusHandler shows the status of the alertmanager. type StatusHandler struct { am *MultitenantAlertmanager diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index e18d2be8d7..b6d0a19721 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -161,6 +161,10 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { _, exists = am.alertmanagers["user3"] require.False(t, exists) + dirs := am.getPerUserDirectories() + require.NotZero(t, dirs["user1"]) + require.NotZero(t, dirs["user2"]) + require.Zero(t, dirs["user3"]) // User3 is deleted, so we should have no more files for it. assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -185,6 +189,10 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { _, exists = am.alertmanagers["user3"] require.True(t, exists) + dirs = am.getPerUserDirectories() + require.NotZero(t, dirs["user1"]) + require.NotZero(t, dirs["user2"]) + require.NotZero(t, dirs["user3"]) // Dir should exist, even though state files are not generated yet. assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -195,7 +203,7 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) } -func TestMultitenantAlertmanager_deleteObsoleteLocalFiles(t *testing.T) { +func TestMultitenantAlertmanager_migrateStateFilesToPerTenantDirectories(t *testing.T) { ctx := context.Background() const ( @@ -215,28 +223,76 @@ func TestMultitenantAlertmanager_deleteObsoleteLocalFiles(t *testing.T) { am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, log.NewNopLogger(), reg) require.NoError(t, err) - testUser1File1 := createFile(t, cfg.DataDir, silencesPrefix+user1) - testUser1File2 := createFile(t, cfg.DataDir, notificationLogPrefix+user1) - testUser2File := createFile(t, cfg.DataDir, notificationLogPrefix+user2) + createFile(t, filepath.Join(cfg.DataDir, "nflog:"+user1)) + createFile(t, filepath.Join(cfg.DataDir, "silences:"+user1)) + createFile(t, filepath.Join(cfg.DataDir, "nflog:"+user2)) + createFile(t, filepath.Join(cfg.DataDir, "templates", user2, "template.tpl")) + + require.NoError(t, am.migrateStateFilesToPerTenantDirectories()) + require.True(t, exists(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot), false)) + require.True(t, exists(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot), false)) + require.True(t, exists(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot), false)) + require.True(t, exists(t, filepath.Join(cfg.DataDir, user2, templatesDir), true)) + require.True(t, exists(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl"), false)) +} + +func exists(t *testing.T, path string, dir bool) bool { + fi, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return false + } + require.NoError(t, err) + } + + require.Equal(t, dir, fi.IsDir()) + return true +} + +func TestMultitenantAlertmanager_deleteUnusedLocalUserState(t *testing.T) { + ctx := context.Background() + + const ( + user1 = "user1" + user2 = "user2" + ) + + store := prepareInMemoryAlertStore() + require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user2, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + + reg := prometheus.NewPedanticRegistry() + cfg := mockAlertmanagerConfig(t) + am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, log.NewNopLogger(), reg) + require.NoError(t, err) + + createFile(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot)) + createFile(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot)) + createFile(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot)) + createFile(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl")) - files := am.getSnapshotFilesPerUser() - require.Equal(t, 2, len(files)) - require.ElementsMatch(t, []string{testUser1File1, testUser1File2}, files[user1]) - require.ElementsMatch(t, []string{testUser2File}, files[user2]) + dirs := am.getPerUserDirectories() + require.Equal(t, 2, len(dirs)) + require.NotZero(t, dirs[user1]) + require.NotZero(t, dirs[user2]) // Ensure the configs are synced correctly err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) // loadAndSyncConfigs also cleans up obsolete files. Let's verify that. - files = am.getSnapshotFilesPerUser() + dirs = am.getPerUserDirectories() - require.Nil(t, files[user1]) // has no configuration, files were deleted - require.NotNil(t, files[user2]) // has config, files survived + require.Zero(t, dirs[user1]) // has no configuration, files were deleted + require.NotZero(t, dirs[user2]) // has config, files survived } -func createFile(t *testing.T, dir string, name string) string { - path := filepath.Join(dir, name) +func createFile(t *testing.T, path string) string { + dir := filepath.Dir(path) + require.NoError(t, os.MkdirAll(dir, 0777)) f, err := os.Create(path) require.NoError(t, err) require.NoError(t, f.Close()) From 1fb06cad07553cbce562df846f3ffcb1d1e8541a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 18:15:00 +0100 Subject: [PATCH 08/14] Store templates to correct place. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 1 + pkg/alertmanager/api.go | 8 +++---- pkg/alertmanager/multitenant.go | 25 ++++++++++++--------- pkg/alertmanager/multitenant_test.go | 33 ++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 14 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index d1f5e6ec87..4e84db0d9e 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -47,6 +47,7 @@ const ( // MaintenancePeriod is used for periodic storing of silences and notifications to local file. maintenancePeriod = 15 * time.Minute + // Filenames used within tenant-directory notificationLogSnapshot = "notifications" silencesSnapshot = "silences" templatesDir = "templates" diff --git a/pkg/alertmanager/api.go b/pkg/alertmanager/api.go index 95b2a9d2e7..74c8dbaf85 100644 --- a/pkg/alertmanager/api.go +++ b/pkg/alertmanager/api.go @@ -153,14 +153,14 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error { // not to configured data dir, and on the flipside, it'll fail if we can't write // to tmpDir. Ignoring both cases for now as they're ultra rare but will revisit if // we see this in the wild. - tmpDir, err := ioutil.TempDir("", "validate-config") + userTempDir, err := ioutil.TempDir("", "validate-config-"+cfg.User) if err != nil { return err } - defer os.RemoveAll(tmpDir) + defer os.RemoveAll(userTempDir) for _, tmpl := range cfg.Templates { - _, err := createTemplateFile(tmpDir, cfg.User, tmpl.Filename, tmpl.Body) + _, err := storeTemplateFile(userTempDir, tmpl.Filename, tmpl.Body) if err != nil { level.Error(logger).Log("msg", "unable to create template file", "err", err, "user", cfg.User) return fmt.Errorf("unable to create template file '%s'", tmpl.Filename) @@ -169,7 +169,7 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error { templateFiles := make([]string, len(amCfg.Templates)) for i, t := range amCfg.Templates { - templateFiles[i] = filepath.Join(tmpDir, "templates", cfg.User, t) + templateFiles[i] = filepath.Join(userTempDir, t) } _, err = template.FromGlobs(templateFiles...) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 5166e897b9..77e2d1e192 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -521,7 +521,7 @@ func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() err } for userID, files := range st { - tenantDir := filepath.Join(am.cfg.DataDir, userID) + tenantDir := am.getTenantDirectory(userID) err := os.MkdirAll(tenantDir, 0777) if err != nil { return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) @@ -788,7 +788,7 @@ func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error var hasTemplateChanges bool for _, tmpl := range cfg.Templates { - hasChanged, err := createTemplateFile(am.cfg.DataDir, cfg.User, tmpl.Filename, tmpl.Body) + hasChanged, err := storeTemplateFile(filepath.Join(am.getTenantDirectory(cfg.User), templatesDir), tmpl.Filename, tmpl.Body) if err != nil { return err } @@ -870,10 +870,14 @@ func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error return nil } +func (am *MultitenantAlertmanager) getTenantDirectory(userID string) string { + return filepath.Join(am.cfg.DataDir, userID) +} + func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config, rawCfg string) (*Alertmanager, error) { reg := prometheus.NewRegistry() - tenantDir := filepath.Join(am.cfg.DataDir, userID) + tenantDir := am.getTenantDirectory(userID) err := os.MkdirAll(tenantDir, 0777) if err != nil { return nil, errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) @@ -1137,21 +1141,22 @@ func (s StatusHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) { } } -func createTemplateFile(dataDir, userID, fn, content string) (bool, error) { - if fn != filepath.Base(fn) { - return false, fmt.Errorf("template file name '%s' is not not valid", fn) +func storeTemplateFile(tenantTemplateDir, templateFileName, content string) (changed bool, _ error) { + if templateFileName != filepath.Base(templateFileName) { + return false, fmt.Errorf("template file name '%s' is not not valid", templateFileName) } - dir := filepath.Join(dataDir, "templates", userID, filepath.Dir(fn)) - err := os.MkdirAll(dir, 0755) + err := os.MkdirAll(tenantTemplateDir, 0755) if err != nil { - return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", dir, err) + return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", tenantTemplateDir, err) } - file := filepath.Join(dir, fn) + file := filepath.Join(tenantTemplateDir, templateFileName) // Check if the template file already exists and if it has changed if tmpl, err := ioutil.ReadFile(file); err == nil && string(tmpl) == content { return false, nil + } else if err != nil && !os.IsNotExist(err) { + return false, err } if err := ioutil.WriteFile(file, []byte(content), 0644); err != nil { diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index b6d0a19721..c727dd18b3 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -1160,3 +1160,36 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) { func prepareInMemoryAlertStore() alertstore.AlertStore { return bucketclient.NewBucketAlertStore(objstore.NewInMemBucket(), nil, log.NewNopLogger()) } + +func TestStoreTemplateFile(t *testing.T) { + tempDir, err := ioutil.TempDir(os.TempDir(), "alertmanager") + require.NoError(t, err) + + t.Cleanup(func() { + require.NoError(t, os.RemoveAll(tempDir)) + }) + + changed, err := storeTemplateFile(templatesDir, "some-template", "content") + require.NoError(t, err) + require.True(t, changed) + + changed, err = storeTemplateFile(templatesDir, "some-template", "new content") + require.NoError(t, err) + require.True(t, changed) + + changed, err = storeTemplateFile(templatesDir, "some-template", "new content") // reusing previous content + require.NoError(t, err) + require.False(t, changed) + + _, err = storeTemplateFile(templatesDir, ".", "content") + require.Error(t, err) + + _, err = storeTemplateFile(templatesDir, "..", "content") + require.Error(t, err) + + _, err = storeTemplateFile(templatesDir, "./test", "content") + require.Error(t, err) + + _, err = storeTemplateFile(templatesDir, "../test", "content") + require.Error(t, err) +} From 27efd7f2ac4579c2965fbe7b4698777b486bb378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 20:43:26 +0100 Subject: [PATCH 09/14] CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36f3300777..85a83fde2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,8 @@ ## master / unreleased -* [CHANGE] Alertmanager: clean obsolete local files after Alertmanager is no longer running for removed or resharded user. #3910 +* [CHANGE] Alertmanager now removes local files after Alertmanager is no longer running for removed or resharded user. #3910 +* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files stored by Alertmanager previously are migrated to new hierarchy. #3910 * [ENHANCEMENT] Ruler: optimized `/api/v1/rules` and `/api/v1/alerts` when ruler sharding is enabled. #3916 * [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916 * `cortex_ruler_clients` From 3ca9ac93746226c3ec395b9e780a515e803c08b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 21:20:41 +0100 Subject: [PATCH 10/14] Verify that templates are stored properly into correct location. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/multitenant_test.go | 75 ++++++++++++++++++---------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index c727dd18b3..b60091efbe 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -107,8 +107,8 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { require.NoError(t, err) require.Len(t, am.alertmanagers, 2) - currentConfig, exists := am.cfgs["user1"] - require.True(t, exists) + currentConfig, cfgExists := am.cfgs["user1"] + require.True(t, cfgExists) require.Equal(t, simpleConfigOne, currentConfig.RawConfig) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` @@ -119,16 +119,38 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is added, it is synced correctly - require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ - User: "user3", - RawConfig: simpleConfigOne, - Templates: []*alertspb.TemplateDesc{}, - })) + user3Cfg := alertspb.AlertConfigDesc{ + User: "user3", + RawConfig: simpleConfigOne + ` +templates: +- 'first.tpl' +- 'second.tpl' +`, + Templates: []*alertspb.TemplateDesc{ + { + Filename: "first.tpl", + Body: `{{ define "t1" }}Template 1 ... {{end}}`, + }, + { + Filename: "second.tpl", + Body: `{{ define "t2" }}Template 2{{ end}}`, + }, + }, + } + require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) require.Len(t, am.alertmanagers, 3) + dirs := am.getPerUserDirectories() + user3Dir := dirs["user3"] + require.NotZero(t, user3Dir) + require.True(t, exists(t, user3Dir, true)) + require.True(t, exists(t, filepath.Join(user3Dir, templatesDir), true)) + require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"), false)) + require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"), false)) + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. # TYPE cortex_alertmanager_config_last_reload_successful gauge @@ -147,24 +169,25 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) - currentConfig, exists = am.cfgs["user1"] - require.True(t, exists) + currentConfig, cfgExists = am.cfgs["user1"] + require.True(t, cfgExists) require.Equal(t, simpleConfigTwo, currentConfig.RawConfig) // Test Delete User, ensure config is removed and the resources are freed. require.NoError(t, store.DeleteAlertConfig(ctx, "user3")) err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) - currentConfig, exists = am.cfgs["user3"] - require.False(t, exists) + currentConfig, cfgExists = am.cfgs["user3"] + require.False(t, cfgExists) require.Equal(t, "", currentConfig.RawConfig) - _, exists = am.alertmanagers["user3"] - require.False(t, exists) - dirs := am.getPerUserDirectories() + _, cfgExists = am.alertmanagers["user3"] + require.False(t, cfgExists) + dirs = am.getPerUserDirectories() require.NotZero(t, dirs["user1"]) require.NotZero(t, dirs["user2"]) require.Zero(t, dirs["user3"]) // User3 is deleted, so we should have no more files for it. + require.False(t, exists(t, user3Dir, false)) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -174,25 +197,27 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is re-added, it is synced correctly - require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ - User: "user3", - RawConfig: simpleConfigOne, - Templates: []*alertspb.TemplateDesc{}, - })) + require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) - currentConfig, exists = am.cfgs["user3"] - require.True(t, exists) - require.Equal(t, simpleConfigOne, currentConfig.RawConfig) + currentConfig, cfgExists = am.cfgs["user3"] + require.True(t, cfgExists) + require.Equal(t, user3Cfg.RawConfig, currentConfig.RawConfig) - _, exists = am.alertmanagers["user3"] - require.True(t, exists) + _, cfgExists = am.alertmanagers["user3"] + require.True(t, cfgExists) dirs = am.getPerUserDirectories() require.NotZero(t, dirs["user1"]) require.NotZero(t, dirs["user2"]) - require.NotZero(t, dirs["user3"]) // Dir should exist, even though state files are not generated yet. + require.Equal(t, user3Dir, dirs["user3"]) // Dir should exist, even though state files are not generated yet. + + // Hierarchy that existed before should exist again. + require.True(t, exists(t, user3Dir, true)) + require.True(t, exists(t, filepath.Join(user3Dir, templatesDir), true)) + require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"), false)) + require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"), false)) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. From 31ae11882d860cb09ab5bb4bc742450d23dd03dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 21:32:14 +0100 Subject: [PATCH 11/14] Comments. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- pkg/alertmanager/multitenant.go | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85a83fde2e..36e54e2c7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## master / unreleased * [CHANGE] Alertmanager now removes local files after Alertmanager is no longer running for removed or resharded user. #3910 -* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files stored by Alertmanager previously are migrated to new hierarchy. #3910 +* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files stored by Alertmanager previously are migrated to new hierarchy. Support for this migration will be removed in Cortex 1.10. #3910 * [ENHANCEMENT] Ruler: optimized `/api/v1/rules` and `/api/v1/alerts` when ruler sharding is enabled. #3916 * [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916 * `cortex_ruler_clients` diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 77e2d1e192..1b404992ad 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -507,7 +507,8 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { return nil } -// migrateStateFilesToPerTenantDirectories migrate any existing configuration from old place to new hierarchy. +// migrateStateFilesToPerTenantDirectories migrates any existing configuration from old place to new hierarchy. +// TODO: Remove in Cortex 1.10. func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() error { migrate := func(from, to string) error { level.Info(am.logger).Log("msg", "migrating AM state", "from", from, "to", to) @@ -1106,6 +1107,8 @@ func (am *MultitenantAlertmanager) deleteUnusedLocalUserState() { } } +// getPerUserDirectories returns map of users to their directories (full path). Only users with local +// directory are returned. func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string { files, err := ioutil.ReadDir(am.cfg.DataDir) if err != nil { From de67abbcdc160c421171e3174100839879b7ed71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 8 Mar 2021 21:36:13 +0100 Subject: [PATCH 12/14] Comments. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/multitenant.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 1b404992ad..f49fa3b3a1 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -1144,17 +1144,21 @@ func (s StatusHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) { } } -func storeTemplateFile(tenantTemplateDir, templateFileName, content string) (changed bool, _ error) { +// storeTemplateFile stores template file with given content into specific directory. +// Since templateFileName is provided by end-user, it is verified that it doesn't do any path-traversal. +// Returns true, if file content has changed (new or updated file), false if file with the same name +// and content was already stored locally. +func storeTemplateFile(dir, templateFileName, content string) (bool, error) { if templateFileName != filepath.Base(templateFileName) { return false, fmt.Errorf("template file name '%s' is not not valid", templateFileName) } - err := os.MkdirAll(tenantTemplateDir, 0755) + err := os.MkdirAll(dir, 0755) if err != nil { - return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", tenantTemplateDir, err) + return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", dir, err) } - file := filepath.Join(tenantTemplateDir, templateFileName) + file := filepath.Join(dir, templateFileName) // Check if the template file already exists and if it has changed if tmpl, err := ioutil.ReadFile(file); err == nil && string(tmpl) == content { return false, nil From 8e944e071ecc278a9cb96275b3aa023b812f8842 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 10 Mar 2021 17:37:23 +0100 Subject: [PATCH 13/14] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Marco Pracucci Signed-off-by: Peter Štibraný --- pkg/alertmanager/multitenant.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index f49fa3b3a1..e7f4b63365 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -511,14 +511,14 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { // TODO: Remove in Cortex 1.10. func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() error { migrate := func(from, to string) error { - level.Info(am.logger).Log("msg", "migrating AM state", "from", from, "to", to) + level.Info(am.logger).Log("msg", "migrating alertmanager state", "from", from, "to", to) err := os.Rename(from, to) - return errors.Wrapf(err, "failed to migrate from %v to %v", from, to) + return errors.Wrapf(err, "failed to migrate alertmanager state from %v to %v", from, to) } st, err := am.getObsoleteFilesPerUser() if err != nil { - return errors.Wrap(err, "failed to migrate existing files") + return errors.Wrap(err, "failed to migrate alertmanager state files") } for userID, files := range st { @@ -593,7 +593,7 @@ func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsolet v.templatesDir = filepath.Join(fullPath, d.Name()) result[d.Name()] = v } else { - level.Warn(am.logger).Log("msg", "ignoring unknown local file", "file", filepath.Join(fullPath, d.Name())) + level.Warn(am.logger).Log("msg", "ignoring unknown local file while migrating local alertmanager state files", "file", filepath.Join(fullPath, d.Name())) } } continue @@ -613,7 +613,7 @@ func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsolet result[userID] = v default: - level.Warn(am.logger).Log("msg", "ignoring unknown local data file", "file", fullPath) + level.Warn(am.logger).Log("msg", "ignoring unknown local data file while migrating local alertmanager state files", "file", fullPath) } } @@ -1122,7 +1122,7 @@ func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string { fullPath := filepath.Join(am.cfg.DataDir, f.Name()) if !f.IsDir() { - level.Warn(am.logger).Log("msg", "ignoring local file", "file", fullPath) + level.Warn(am.logger).Log("msg", "ignoring unexpected file while scanning local alertmanager configs", "file", fullPath) continue } From 674605972185a531393bd3fc8f8138324d562d76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Wed, 10 Mar 2021 17:43:19 +0100 Subject: [PATCH 14/14] Review feedback. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/multitenant_test.go | 38 +++++++++++++++++----------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index b60091efbe..6f295f294b 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -146,10 +146,10 @@ templates: dirs := am.getPerUserDirectories() user3Dir := dirs["user3"] require.NotZero(t, user3Dir) - require.True(t, exists(t, user3Dir, true)) - require.True(t, exists(t, filepath.Join(user3Dir, templatesDir), true)) - require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"), false)) - require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"), false)) + require.True(t, dirExists(t, user3Dir)) + require.True(t, dirExists(t, filepath.Join(user3Dir, templatesDir))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -187,7 +187,7 @@ templates: require.NotZero(t, dirs["user1"]) require.NotZero(t, dirs["user2"]) require.Zero(t, dirs["user3"]) // User3 is deleted, so we should have no more files for it. - require.False(t, exists(t, user3Dir, false)) + require.False(t, fileExists(t, user3Dir)) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -214,10 +214,10 @@ templates: require.Equal(t, user3Dir, dirs["user3"]) // Dir should exist, even though state files are not generated yet. // Hierarchy that existed before should exist again. - require.True(t, exists(t, user3Dir, true)) - require.True(t, exists(t, filepath.Join(user3Dir, templatesDir), true)) - require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"), false)) - require.True(t, exists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"), false)) + require.True(t, dirExists(t, user3Dir)) + require.True(t, dirExists(t, filepath.Join(user3Dir, templatesDir))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -254,14 +254,22 @@ func TestMultitenantAlertmanager_migrateStateFilesToPerTenantDirectories(t *test createFile(t, filepath.Join(cfg.DataDir, "templates", user2, "template.tpl")) require.NoError(t, am.migrateStateFilesToPerTenantDirectories()) - require.True(t, exists(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot), false)) - require.True(t, exists(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot), false)) - require.True(t, exists(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot), false)) - require.True(t, exists(t, filepath.Join(cfg.DataDir, user2, templatesDir), true)) - require.True(t, exists(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl"), false)) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot))) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot))) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot))) + require.True(t, dirExists(t, filepath.Join(cfg.DataDir, user2, templatesDir))) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl"))) } -func exists(t *testing.T, path string, dir bool) bool { +func fileExists(t *testing.T, path string) bool { + return checkExists(t, path, false) +} + +func dirExists(t *testing.T, path string) bool { + return checkExists(t, path, true) +} + +func checkExists(t *testing.T, path string, dir bool) bool { fi, err := os.Stat(path) if err != nil { if os.IsNotExist(err) {