From 8f0fcd6e569ecd394d1dfa0acffe86d72d91e98a Mon Sep 17 00:00:00 2001 From: Thomas Stromberg Date: Mon, 20 Oct 2025 19:20:55 +0200 Subject: [PATCH] Reliability improvements - logging, races --- cmd/server/main.go | 2 +- go.mod | 10 ++--- go.sum | 20 +++++----- internal/bot/bot_sprinkler.go | 29 +++++++++++--- internal/bot/polling.go | 7 ++-- internal/config/config.go | 71 +++++++++++++++++------------------ pkg/home/fetcher.go | 4 +- 7 files changed, 80 insertions(+), 63 deletions(-) diff --git a/cmd/server/main.go b/cmd/server/main.go index 66b2244..0504703 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -145,7 +145,7 @@ func run(ctx context.Context, cancel context.CancelFunc, cfg *config.ServerConfi "startup_message", "Starting Slacker server...") // Initialize config manager for repo configs. - configManager := config.New(ctx) + configManager := config.New() // Initialize GitHub installation manager. githubManager, err := github.NewManager(ctx, cfg.GitHubAppID, cfg.GitHubPrivateKey, cfg.AllowPersonalAccounts) diff --git a/go.mod b/go.mod index fb52c78..32773d2 100644 --- a/go.mod +++ b/go.mod @@ -3,11 +3,11 @@ module github.com/codeGROOVE-dev/slacker go 1.25.1 require ( - cloud.google.com/go/datastore v1.20.0 + cloud.google.com/go/datastore v1.21.0 github.com/codeGROOVE-dev/gh-mailto v0.0.0-20251019162917-c3412c017b1f github.com/codeGROOVE-dev/gsm v0.0.0-20251019065141-833fe2363d22 github.com/codeGROOVE-dev/retry v1.2.0 - github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020140418-efb533e2ff51 + github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020171924-1aac68f58e14 github.com/codeGROOVE-dev/turnclient v0.0.0-20251018202306-7cdc0d51856e github.com/golang-jwt/jwt/v5 v5.3.0 github.com/google/go-github/v50 v50.2.0 @@ -48,9 +48,9 @@ require ( golang.org/x/sys v0.37.0 // indirect golang.org/x/text v0.30.0 // indirect google.golang.org/api v0.252.0 // indirect - google.golang.org/genproto v0.0.0-20251014184007-4626949a642f // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20251014184007-4626949a642f // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect + google.golang.org/genproto v0.0.0-20251020155222-88f65dc88635 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251020155222-88f65dc88635 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251020155222-88f65dc88635 // indirect google.golang.org/grpc v1.76.0 // indirect google.golang.org/protobuf v1.36.10 // indirect ) diff --git a/go.sum b/go.sum index 1a79fc5..bab174e 100644 --- a/go.sum +++ b/go.sum @@ -6,8 +6,8 @@ cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIi cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= -cloud.google.com/go/datastore v1.20.0 h1:NNpXoyEqIJmZFc0ACcwBEaXnmscUpcG4NkKnbCePmiM= -cloud.google.com/go/datastore v1.20.0/go.mod h1:uFo3e+aEpRfHgtp5pp0+6M0o147KoPaYNaPAKpfh8Ew= +cloud.google.com/go/datastore v1.21.0 h1:dUrYq47ysCA4nM7u8kRT0WnbfXc6TzX49cP3TCwIiA0= +cloud.google.com/go/datastore v1.21.0/go.mod h1:9l+KyAHO+YVVcdBbNQZJu8svF17Nw5sMKuFR0LYf1nY= github.com/ProtonMail/go-crypto v1.3.0 h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw= github.com/ProtonMail/go-crypto v1.3.0/go.mod h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE= github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0= @@ -22,8 +22,8 @@ github.com/codeGROOVE-dev/prx v0.0.0-20251016165946-00c6c6e90c29 h1:MSBy3Ywr3ky/ github.com/codeGROOVE-dev/prx v0.0.0-20251016165946-00c6c6e90c29/go.mod h1:7qLbi18baOyS8yO/6/64SBIqtyzSzLFdsDST15NPH3w= github.com/codeGROOVE-dev/retry v1.2.0 h1:xYpYPX2PQZmdHwuiQAGGzsBm392xIMl4nfMEFApQnu8= github.com/codeGROOVE-dev/retry v1.2.0/go.mod h1:8OgefgV1XP7lzX2PdKlCXILsYKuz6b4ZpHa/20iLi8E= -github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020140418-efb533e2ff51 h1:oPVbUoZ1jxgmrqybgRCfhwdT8KaXE/hzQ4vAswRybt0= -github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020140418-efb533e2ff51/go.mod h1:/kd3ncsRNldD0MUpbtp5ojIzfCkyeXB7JdOrpuqG7Gg= +github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020171924-1aac68f58e14 h1:AKcULaDrbhKDkf6vpWGo36iyLoiOVhLu1MFcnNmDbWg= +github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020171924-1aac68f58e14/go.mod h1:/kd3ncsRNldD0MUpbtp5ojIzfCkyeXB7JdOrpuqG7Gg= github.com/codeGROOVE-dev/turnclient v0.0.0-20251018202306-7cdc0d51856e h1:3qoY6h8SgoeNsIYRM7P6PegTXAHPo8OSOapUunVP/Gs= github.com/codeGROOVE-dev/turnclient v0.0.0-20251018202306-7cdc0d51856e/go.mod h1:fYwtN9Ql6lY8t2WvCfENx+mP5FUwjlqwXCLx9CVLY20= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -118,12 +118,12 @@ gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/api v0.252.0 h1:xfKJeAJaMwb8OC9fesr369rjciQ704AjU/psjkKURSI= google.golang.org/api v0.252.0/go.mod h1:dnHOv81x5RAmumZ7BWLShB/u7JZNeyalImxHmtTHxqw= -google.golang.org/genproto v0.0.0-20251014184007-4626949a642f h1:vLd1CJuJOUgV6qijD7KT5Y2ZtC97ll4dxjTUappMnbo= -google.golang.org/genproto v0.0.0-20251014184007-4626949a642f/go.mod h1:PI3KrSadr00yqfv6UDvgZGFsmLqeRIwt8x4p5Oo7CdM= -google.golang.org/genproto/googleapis/api v0.0.0-20251014184007-4626949a642f h1:OiFuztEyBivVKDvguQJYWq1yDcfAHIID/FVrPR4oiI0= -google.golang.org/genproto/googleapis/api v0.0.0-20251014184007-4626949a642f/go.mod h1:kprOiu9Tr0JYyD6DORrc4Hfyk3RFXqkQ3ctHEum3ZbM= -google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f h1:1FTH6cpXFsENbPR5Bu8NQddPSaUUE6NA2XdZdDSAJK4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/genproto v0.0.0-20251020155222-88f65dc88635 h1:I5FLgnlmGA5voD3BZp9Rc17FGiius/DlMB3WsJ1C4Xw= +google.golang.org/genproto v0.0.0-20251020155222-88f65dc88635/go.mod h1:1Ic78BnpzY8OaTCmzxJDP4qC9INZPbGZl+54RKjtyeI= +google.golang.org/genproto/googleapis/api v0.0.0-20251020155222-88f65dc88635 h1:1wvBeYv+A2zfEbxROscJl69OP0m74S8wGEO+Syat26o= +google.golang.org/genproto/googleapis/api v0.0.0-20251020155222-88f65dc88635/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251020155222-88f65dc88635 h1:3uycTxukehWrxH4HtPRtn1PDABTU331ViDjyqrUbaog= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251020155222-88f65dc88635/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= diff --git a/internal/bot/bot_sprinkler.go b/internal/bot/bot_sprinkler.go index 6b06179..e314132 100644 --- a/internal/bot/bot_sprinkler.go +++ b/internal/bot/bot_sprinkler.go @@ -278,19 +278,26 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error { NoReconnect: false, // Enable automatic reconnection PingInterval: 0, // Use default (30 seconds) OnConnect: func() { - slog.Info("sprinkler client connected", + slog.Warn("🟢 SPRINKLER CONNECTED", "organization", organization, - "url", c.sprinklerURL) + "url", c.sprinklerURL, + "subscribed_events", "*", + "critical", "now receiving real-time webhook events") }, OnDisconnect: func(err error) { if err != nil { - slog.Error("sprinkler client disconnected", + slog.Error("🔴 SPRINKLER DISCONNECTED - WILL MISS EVENTS UNTIL RECONNECTED", "organization", organization, - "error", err) + "error", err, + "impact", "real-time webhook events will be missed", + "fallback", "5-minute polling will catch missed events", + "action_required", "investigate why connection dropped") return } - slog.Info("sprinkler client disconnected normally", - "organization", organization) + slog.Warn("🟡 SPRINKLER DISCONNECTED (graceful)", + "organization", organization, + "reason", "clean shutdown or reconnection attempt", + "impact", "may miss events during reconnection window") }, OnEvent: func(event client.Event) { // SECURITY NOTE: Use detached context for event processing to prevent webhook @@ -319,6 +326,10 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error { cleanupTicker := time.NewTicker(6 * time.Hour) defer cleanupTicker.Stop() + // Connection health monitoring - log every minute to detect silent disconnections + healthTicker := time.NewTicker(1 * time.Minute) + defer healthTicker.Stop() + go func() { for { select { @@ -327,6 +338,12 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error { case <-cleanupTicker.C: c.threadCache.Cleanup(30 * 24 * time.Hour) slog.Debug("cleaned up old thread cache entries", "organization", organization) + case <-healthTicker.C: + // Log connection health every minute + // This helps detect if we're silently disconnected for long periods + slog.Debug("sprinkler connection health check", + "organization", organization, + "note", "if you see this log but no events for >2min during active PR work, connection may be broken") } } }() diff --git a/internal/bot/polling.go b/internal/bot/polling.go index 6ca6e27..3bd225c 100644 --- a/internal/bot/polling.go +++ b/internal/bot/polling.go @@ -412,11 +412,12 @@ func (c *Coordinator) StartupReconciliation(ctx context.Context) { // Determine if we should notify var reason string - if lastNotified.IsZero() { + switch { + case lastNotified.IsZero(): reason = "never_notified" - } else if pr.UpdatedAt.After(lastNotified) { + case pr.UpdatedAt.After(lastNotified): reason = "updated_since_last_notification" - } else { + default: skippedCount++ slog.Debug("skipping PR - already notified and not updated", "pr", fmt.Sprintf("%s/%s#%d", pr.Owner, pr.Repo, pr.Number), diff --git a/internal/config/config.go b/internal/config/config.go index dddb4c6..ceb3cde 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -9,6 +9,7 @@ import ( "net/http" "strings" "sync" + "sync/atomic" "time" "github.com/codeGROOVE-dev/retry" @@ -59,12 +60,13 @@ type configCacheEntry struct { } // configCache manages configuration caching with TTL and thread safety. +// Statistics counters use atomic operations to avoid races during concurrent reads. type configCache struct { entries map[string]configCacheEntry ttl time.Duration mu sync.RWMutex - hits int64 - misses int64 + hits atomic.Int64 // Atomic to avoid race during concurrent get() calls + misses atomic.Int64 // Atomic to avoid race during concurrent get() calls } // get retrieves a cached configuration if it exists and is not expired. @@ -74,16 +76,16 @@ func (c *configCache) get(org string) (*RepoConfig, bool) { entry, exists := c.entries[org] if !exists { - c.misses++ + c.misses.Add(1) return nil, false } if time.Since(entry.timestamp) > c.ttl { - c.misses++ + c.misses.Add(1) return nil, false } - c.hits++ + c.hits.Add(1) return entry.config, true } @@ -117,10 +119,9 @@ func (c *configCache) invalidateAll() { } // stats returns cache statistics. +// No lock needed - atomic reads are inherently thread-safe. func (c *configCache) stats() (hits, misses int64) { - c.mu.RLock() - defer c.mu.RUnlock() - return c.hits, c.misses + return c.hits.Load(), c.misses.Load() } // Manager manages repository configurations. @@ -133,7 +134,7 @@ type Manager struct { } // New creates a new config manager. -func New(ctx context.Context) *Manager { +func New() *Manager { return &Manager{ configs: make(map[string]*RepoConfig), clients: make(map[string]*github.Client), @@ -183,7 +184,7 @@ func createDefaultConfig() *RepoConfig { // LoadConfig loads the configuration for a GitHub org with retry logic. func (m *Manager) LoadConfig(ctx context.Context, org string) error { // Check cache first - if cachedConfig, found := m.cache.get(org); found { + if cfg, found := m.cache.get(org); found { hits, misses := m.cache.stats() slog.Debug("using cached config for organization", logFieldOrg, org, @@ -192,7 +193,7 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error { "cache_hit_ratio", float64(hits)/float64(hits+misses)) m.mu.Lock() - m.configs[org] = cachedConfig + m.configs[org] = cfg m.mu.Unlock() return nil } @@ -284,9 +285,9 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error { ) if err != nil { // Use default empty config if not found - defaultConfig := createDefaultConfig() - m.configs[org] = defaultConfig - m.cache.set(org, defaultConfig) + cfg := createDefaultConfig() + m.configs[org] = cfg + m.cache.set(org, cfg) hits, misses := m.cache.stats() slog.Info("using default configuration for org", @@ -307,9 +308,9 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error { var config RepoConfig if err := yaml.Unmarshal([]byte(configContent), &config); err != nil { - defaultConfig := createDefaultConfig() - m.configs[org] = defaultConfig - m.cache.set(org, defaultConfig) + cfg := createDefaultConfig() + m.configs[org] = cfg + m.cache.set(org, cfg) hits, misses := m.cache.stats() slog.Error("failed to parse YAML configuration - using defaults", @@ -331,31 +332,29 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error { "email_domain", config.Global.EmailDomain) // Count channel configurations - mutedChannels := 0 - totalRepos := 0 - wildcardChannels := 0 - for channelName, channelConfig := range config.Channels { - if channelConfig.Mute { - mutedChannels++ + var muted, repos, wildcard int + for name, ch := range config.Channels { + if ch.Mute { + muted++ } - totalRepos += len(channelConfig.Repos) + repos += len(ch.Repos) - hasWildcard := false - for _, repo := range channelConfig.Repos { + hasWild := false + for _, repo := range ch.Repos { if repo == "*" { - wildcardChannels++ - hasWildcard = true + wildcard++ + hasWild = true break } } slog.Debug("channel configuration loaded", logFieldOrg, org, - "channel", channelName, - "repos_count", len(channelConfig.Repos), - "repos", channelConfig.Repos, - "muted", channelConfig.Mute, - "has_wildcard", hasWildcard) + "channel", name, + "repos_count", len(ch.Repos), + "repos", ch.Repos, + "muted", ch.Mute, + "has_wildcard", hasWild) } m.configs[org] = &config @@ -371,9 +370,9 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error { "email_domain": config.Global.EmailDomain, "daily_reminders": config.Global.DailyReminders, "total_channels": len(config.Channels), - "muted_channels": mutedChannels, - "wildcard_channels": wildcardChannels, - "total_repo_mappings": totalRepos, + "muted_channels": muted, + "wildcard_channels": wildcard, + "total_repo_mappings": repos, }, "cached", true, "cache_hits", hits, diff --git a/pkg/home/fetcher.go b/pkg/home/fetcher.go index 06e89d5..79deb3e 100644 --- a/pkg/home/fetcher.go +++ b/pkg/home/fetcher.go @@ -232,9 +232,9 @@ func (f *Fetcher) enrichPRs(ctx context.Context, prs []PR, githubUsername string checkResult, err = turnClient.Check(ctx, pr.URL, f.botUsername, pr.LastEventTime) return err }, - retry.Attempts(3), // Fewer attempts for per-PR enrichment + retry.Attempts(5), retry.Delay(500*time.Millisecond), - retry.MaxDelay(30*time.Second), + retry.MaxDelay(2*time.Minute), retry.DelayType(retry.BackOffDelay), retry.MaxJitter(time.Second), retry.Context(ctx),