
Commit

Address feedback
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>
gouthamve committed Aug 24, 2020
1 parent 950221f commit c3727b3
Showing 2 changed files with 31 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -12,6 +12,7 @@
 * [CHANGE] Experimental Delete Series: `/api/v1/admin/tsdb/delete_series` and `/api/v1/admin/tsdb/cancel_delete_request` purger APIs to return status code `204` instead of `200` for success. #2946
 * [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in non-batched lookups). The same change applies to tracing spans. #3046
 * [CHANGE] TLS server validation is now enabled by default; the new `tls_insecure_skip_verify` parameter can be set to `true` to skip validation. #3030
+* [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056
 * [ENHANCEMENT] Add support for Azure storage in China, Germany, and US Government environments. #2988
 * [ENHANCEMENT] Query-tee: added a small tolerance to floating point sample values comparison. #2994
 * [ENHANCEMENT] Query-tee: added support for passing requests through to the preferred backend for unregistered routes. #3018
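The changelog entry for #3056 is the user-visible half of this commit: a per-tenant failure counter is replaced by a boolean "last reload successful" gauge. A minimal, self-contained Go sketch of the new pattern — the registry and tenant names here are illustrative, not part of the commit:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()

	// Same shape as the gauge this commit introduces:
	// 1 = the tenant's last config reload succeeded, 0 = it failed.
	lastReloadSuccessful := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "ruler_config_last_reload_successful",
		Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
	}, []string{"user"})

	lastReloadSuccessful.WithLabelValues("tenant-a").Set(1)
	lastReloadSuccessful.WithLabelValues("tenant-b").Set(0)

	// testutil.ToFloat64 reads back a single series, e.g. in a unit test.
	fmt.Println(testutil.ToFloat64(lastReloadSuccessful.WithLabelValues("tenant-b"))) // 0
}
```

An alert on `cortex_ruler_config_last_reload_successful == 0` can then replace rate-based alerts on the removed counter.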
57 changes: 30 additions & 27 deletions pkg/ruler/manager.go
@@ -20,24 +20,6 @@ import (
 	"github.com/cortexproject/cortex/pkg/util"
 )

-var (
-	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_updates_total",
-		Help:      "Total number of config updates triggered by a user",
-	}, []string{"user"})
-	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_update_failures_total",
-		Help:      "Total number of config update failures triggered by a user",
-	}, []string{"user", "reason"})
-	userManagerFailed = promauto.NewGaugeVec(prometheus.GaugeOpts{
-		Namespace: "cortex",
-		Name:      "ruler_manager_failed",
-		Help:      "Boolean set to 1 whenever the Ruler manager failed to start for a user.",
-	}, []string{"user"})
-)
-
 type DefaultMultiTenantManager struct {
 	cfg         Config
 	notifierCfg *config.Config
@@ -55,9 +37,12 @@ type DefaultMultiTenantManager struct {
 	notifiersMtx sync.Mutex
 	notifiers    map[string]*rulerNotifier

-	managersTotal prometheus.Gauge
-	registry      prometheus.Registerer
-	logger        log.Logger
+	managersTotal                 prometheus.Gauge
+	lastReloadSuccessful          *prometheus.GaugeVec
+	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
+	configUpdatesTotal            *prometheus.CounterVec
+	registry                      prometheus.Registerer
+	logger                        log.Logger
 }

 func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -84,6 +69,21 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
 			Name:      "ruler_managers_total",
 			Help:      "Total number of managers registered and running in the ruler",
 		}),
+		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful",
+			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
+		}, []string{"user"}),
+		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful_seconds",
+			Help:      "Timestamp of the last successful configuration reload.",
+		}, []string{"user"}),
+		configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_updates_total",
+			Help:      "Total number of config updates triggered by a user",
+		}, []string{"user"}),
 		registry: reg,
 		logger:   logger,
 	}, nil
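Note the move from the package-level `promauto.NewCounterVec` globals deleted above to `promauto.With(reg)` inside the constructor: metrics now live in whatever registry the caller passes in, so two managers (for example, in tests) no longer collide on one process-wide series. A short sketch of the two styles, using a hypothetical metric name:

```go
package ruler

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Old style (removed by this commit): registered once in the default
// registry at package init, shared by every manager in the process.
var globalUpdates = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "example_config_updates_total", // hypothetical name
}, []string{"user"})

// New style: registered in whichever Registerer the caller provides,
// so each manager instance owns (and can independently test) its metrics.
func newConfigUpdatesTotal(reg prometheus.Registerer) *prometheus.CounterVec {
	return promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Name: "example_config_updates_total", // hypothetical name
	}, []string{"user"})
}
```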
@@ -104,6 +104,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
 		if _, exists := ruleGroups[userID]; !exists {
 			go mngr.Stop()
 			delete(r.userManagers, userID)
+			r.lastReloadSuccessful.DeleteLabelValues(userID)
+			r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
+			r.configUpdatesTotal.DeleteLabelValues(userID)
 			level.Info(r.logger).Log("msg", "deleting rule manager", "user", userID)
 		}
 	}
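Deleting the tenant's label values alongside its manager keeps the registry from exporting stale series: without this, a removed tenant would keep reporting its last reload status forever. A small, self-contained sketch of that cleanup, with a hypothetical metric name:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	g := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Name: "example_last_reload_successful", // hypothetical name
	}, []string{"user"})

	g.WithLabelValues("tenant-a").Set(1)

	// Without this, the series for a deleted tenant keeps its last value
	// and is scraped forever. DeleteLabelValues reports whether it existed.
	fmt.Println(g.DeleteLabelValues("tenant-a")) // true
	fmt.Println(g.DeleteLabelValues("tenant-a")) // false: already gone
}
```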
@@ -118,19 +121,19 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 	// have been updated
 	update, files, err := r.mapper.MapRules(user, groups.Formatted())
 	if err != nil {
+		r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 		level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
 		return
 	}

 	if update {
 		level.Debug(r.logger).Log("msg", "updating rules", "user", user)
-		configUpdatesTotal.WithLabelValues(user).Inc()
+		r.configUpdatesTotal.WithLabelValues(user).Inc()
 		manager, exists := r.userManagers[user]
 		if !exists {
 			manager, err = r.newManager(ctx, user)
 			if err != nil {
-				configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
-				userManagerFailed.WithLabelValues(user).Set(1)
+				r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
 				return
 			}
@@ -141,13 +144,13 @@ }
 		}
 		err = manager.Update(r.cfg.EvaluationInterval, files, nil)
 		if err != nil {
-			configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
-			userManagerFailed.WithLabelValues(user).Set(1)
+			r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
 			return
 		}

-		userManagerFailed.WithLabelValues(user).Set(0)
+		r.lastReloadSuccessful.WithLabelValues(user).Set(1)
+		r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
 	}
 }

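On the success path the commit sets the boolean gauge to 1 and stamps the companion `_seconds` gauge via `SetToCurrentTime`, so dashboards can show both current health and time since the last good reload. A minimal sketch of that pairing, with hypothetical names and a stand-in `reload` function in place of the mapper/manager work above:

```go
package main

import (
	"errors"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	reg = prometheus.NewRegistry()

	lastReloadOK = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Name: "example_config_last_reload_successful", // hypothetical name
	}, []string{"user"})
	lastReloadOKSeconds = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Name: "example_config_last_reload_successful_seconds", // hypothetical name
	}, []string{"user"})
)

// reload stands in for the real rule-mapping and manager-update work.
func reload(user string) error { return errors.New("not implemented") }

func syncUser(user string) {
	if err := reload(user); err != nil {
		// Flip the boolean but leave the timestamp at the last success,
		// so `time() - ..._seconds` keeps growing while reloads fail.
		lastReloadOK.WithLabelValues(user).Set(0)
		return
	}
	lastReloadOK.WithLabelValues(user).Set(1)
	lastReloadOKSeconds.WithLabelValues(user).SetToCurrentTime()
}

func main() { syncUser("tenant-a") }
```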
