From c683ec4994e36dab8e9f1e06ec33963ad8b7ee3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Fri, 16 Dec 2022 10:46:25 +0100 Subject: [PATCH 1/2] Fix panic during startup caused by gossip message arriving too soon --- CHANGELOG.md | 1 + go.mod | 2 +- go.sum | 4 +-- vendor/github.com/grafana/dskit/gate/gate.go | 3 +- .../dskit/kv/memberlist/memberlist_client.go | 28 +++++++++++++------ vendor/modules.txt | 2 +- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f0b1ed7ad9..b930520071e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ * [BUGFIX] Alertmanager: Fix template spurious deletion with relative data dir. #3604 * [BUGFIX] Security: update prometheus/exporter-toolkit for CVE-2022-46146. #3675 * [BUGFIX] Debian package: Fix post-install, environment file path and user creation. #3720 +* [BUGFIX] memberlist: Fix panic during Mimir startup when Mimir receives gossip message before it's ready. ### Mixin diff --git a/go.mod b/go.mod index cf62b395450..678f29a0d34 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,7 @@ require ( github.com/golang/snappy v0.0.4 github.com/google/gopacket v1.1.19 github.com/gorilla/mux v1.8.0 - github.com/grafana/dskit v0.0.0-20221213192733-6a4490513ed8 + github.com/grafana/dskit v0.0.0-20221216094413-a9826f9b0468 github.com/grafana/e2e v0.1.1-0.20221115065638-fe4609fcbc71 github.com/hashicorp/golang-lru v0.5.4 github.com/json-iterator/go v1.1.12 diff --git a/go.sum b/go.sum index 5f4a9e8f363..73d103e7ec2 100644 --- a/go.sum +++ b/go.sum @@ -489,8 +489,8 @@ github.com/gosimple/slug v1.1.1 h1:fRu/digW+NMwBIP+RmviTK97Ho/bEj/C9swrCspN3D4= github.com/gosimple/slug v1.1.1/go.mod h1:ER78kgg1Mv0NQGlXiDe57DpCyfbNywXXZ9mIorhxAf0= github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9 h1:LQAhgcUPnzdjU/OjCJaLlPQI7NmQCRlfjMPSA1VegvA= github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4= -github.com/grafana/dskit v0.0.0-20221213192733-6a4490513ed8 h1:xVld//+NUMsvMNJ/pcWYHbf6MNLMpW4OQVenjfNsO8U= -github.com/grafana/dskit v0.0.0-20221213192733-6a4490513ed8/go.mod h1:gNzHl84UZNg3QRc2vPbwHXGqMs6kQeVX+4U9jB3gC+k= +github.com/grafana/dskit v0.0.0-20221216094413-a9826f9b0468 h1:j+DoI0Lu4XAr2l9FcSoHIe4H2ht9lF2YMIgiXvi8/Ug= +github.com/grafana/dskit v0.0.0-20221216094413-a9826f9b0468/go.mod h1:urdZrnf7PIsHlv7ptjY2gS34HuR98x8Oe0uOC1VYnto= github.com/grafana/e2e v0.1.1-0.20221115065638-fe4609fcbc71 h1:sAMF3CGAtlWLimTyvsO46Slwu1p6Fm5EMB64XzG5btI= github.com/grafana/e2e v0.1.1-0.20221115065638-fe4609fcbc71/go.mod h1:3UsooRp7yW5/NJQBlXcTsAHOoykEhNUYXkQ3r6ehEEY= github.com/grafana/gomemcache v0.0.0-20220812141943-44b6cde200bb h1:CqfZjjd8iK3G1TV8Wf0u7WTY+0RxIEbmcgxftt9qVtw= diff --git a/vendor/github.com/grafana/dskit/gate/gate.go b/vendor/github.com/grafana/dskit/gate/gate.go index db799945f23..6b2ff2461b4 100644 --- a/vendor/github.com/grafana/dskit/gate/gate.go +++ b/vendor/github.com/grafana/dskit/gate/gate.go @@ -8,7 +8,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - promgate "github.com/prometheus/prometheus/util/gate" ) var ErrMaxConcurrent = errors.New("max concurrent requests inflight") @@ -49,7 +48,7 @@ func (noopGate) Done() {} // It can be called several times but not with the same registerer otherwise it // will panic when trying to register the same metric multiple times. func New(reg prometheus.Registerer, maxConcurrent int) Gate { - return NewInstrumented(reg, maxConcurrent, promgate.New(maxConcurrent)) + return NewInstrumented(reg, maxConcurrent, NewBlocking(maxConcurrent)) } // NewInstrumented wraps a Gate implementation with one that records max number of inflight diff --git a/vendor/github.com/grafana/dskit/kv/memberlist/memberlist_client.go b/vendor/github.com/grafana/dskit/kv/memberlist/memberlist_client.go index 390eca606f5..d18b6d452cb 100644 --- a/vendor/github.com/grafana/dskit/kv/memberlist/memberlist_client.go +++ b/vendor/github.com/grafana/dskit/kv/memberlist/memberlist_client.go @@ -18,6 +18,7 @@ import ( "github.com/go-kit/log/level" "github.com/hashicorp/memberlist" "github.com/prometheus/client_golang/prometheus" + "go.uber.org/atomic" "github.com/grafana/dskit/backoff" "github.com/grafana/dskit/flagext" @@ -233,9 +234,9 @@ type KV struct { provider DNSProvider // Protects access to memberlist and broadcasts fields. - initWG sync.WaitGroup - memberlist *memberlist.Memberlist - broadcasts *memberlist.TransmitLimitedQueue + delegateReady atomic.Bool + memberlist *memberlist.Memberlist + broadcasts *memberlist.TransmitLimitedQueue // KV Store. storeMu sync.Mutex @@ -451,7 +452,6 @@ func (m *KV) starting(ctx context.Context) error { // // Note: We cannot check for Starting state, as we want to use delegate during cluster joining process // that happens in Starting state. - m.initWG.Add(1) list, err := memberlist.Create(mlCfg) if err != nil { return fmt.Errorf("failed to create memberlist: %v", err) @@ -462,7 +462,7 @@ func (m *KV) starting(ctx context.Context) error { NumNodes: list.NumMembers, RetransmitMult: mlCfg.RetransmitMult, } - m.initWG.Done() + m.delegateReady.Store(true) // Try to fast-join memberlist cluster in Starting state, so that we don't start with empty KV store. if len(m.cfg.JoinMembers) > 0 { @@ -992,6 +992,10 @@ func (m *KV) NodeMeta(limit int) []byte { // NotifyMsg is method from Memberlist Delegate interface // Called when single message is received, i.e. what our broadcastNewValue has sent. func (m *KV) NotifyMsg(msg []byte) { + if !m.delegateReady.Load() { + return + } + m.numberOfReceivedMessages.Inc() m.totalSizeOfReceivedMessages.Add(float64(len(msg))) @@ -1101,7 +1105,9 @@ func (m *KV) queueBroadcast(key string, content []string, version uint, message // GetBroadcasts is method from Memberlist Delegate interface // It returns all pending broadcasts (within the size limit) func (m *KV) GetBroadcasts(overhead, limit int) [][]byte { - m.initWG.Wait() + if !m.delegateReady.Load() { + return nil + } return m.broadcasts.GetBroadcasts(overhead, limit) } @@ -1112,7 +1118,9 @@ func (m *KV) GetBroadcasts(overhead, limit int) [][]byte { // Here we dump our entire state -- all keys and their values. There is no limit on message size here, // as Memberlist uses 'stream' operations for transferring this state. func (m *KV) LocalState(join bool) []byte { - m.initWG.Wait() + if !m.delegateReady.Load() { + return nil + } m.numberOfPulls.Inc() @@ -1184,9 +1192,11 @@ func (m *KV) LocalState(join bool) []byte { // // Data is full state of remote KV store, as generated by LocalState method (run on another node). func (m *KV) MergeRemoteState(data []byte, join bool) { - received := time.Now() + if !m.delegateReady.Load() { + return + } - m.initWG.Wait() + received := time.Now() m.numberOfPushes.Inc() m.totalSizeOfPushes.Add(float64(len(data))) diff --git a/vendor/modules.txt b/vendor/modules.txt index f36c436589a..73a04f38fbe 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -462,7 +462,7 @@ github.com/gosimple/slug # github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9 ## explicit; go 1.13 github.com/grafana-tools/sdk -# github.com/grafana/dskit v0.0.0-20221213192733-6a4490513ed8 +# github.com/grafana/dskit v0.0.0-20221216094413-a9826f9b0468 ## explicit; go 1.18 github.com/grafana/dskit/backoff github.com/grafana/dskit/cache From b5d87d3210af986d7cdd812c4499b5a61e8b668e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 16 Dec 2022 11:53:40 +0100 Subject: [PATCH 2/2] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b930520071e..a148b876444 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ * [BUGFIX] Alertmanager: Fix template spurious deletion with relative data dir. #3604 * [BUGFIX] Security: update prometheus/exporter-toolkit for CVE-2022-46146. #3675 * [BUGFIX] Debian package: Fix post-install, environment file path and user creation. #3720 -* [BUGFIX] memberlist: Fix panic during Mimir startup when Mimir receives gossip message before it's ready. +* [BUGFIX] memberlist: Fix panic during Mimir startup when Mimir receives gossip message before it's ready. #3746 ### Mixin