From 5cefdacff2568eedda0130f3346ebd2707756076 Mon Sep 17 00:00:00 2001 From: Thomas Stromberg Date: Mon, 27 Oct 2025 16:54:52 -0400 Subject: [PATCH] Improve commit->PR lookup reliability --- internal/bot/bot.go | 11 +-- internal/bot/bot_sprinkler.go | 138 +++++++++++++++++++++++++++++++++- internal/bot/polling_test.go | 18 +++-- internal/github/github.go | 74 ++++++++++++++++++ 4 files changed, 222 insertions(+), 19 deletions(-) diff --git a/internal/bot/bot.go b/internal/bot/bot.go index 139384e..d22912e 100644 --- a/internal/bot/bot.go +++ b/internal/bot/bot.go @@ -1132,8 +1132,6 @@ func (c *Coordinator) processPRForChannel( // Resolve channel name to ID for API calls channelID := c.slack.ResolveChannelID(ctx, channelName) - - // Check if channel resolution failed if channelID == channelName || (channelName != "" && channelName[0] == '#' && channelID == channelName[1:]) { slog.Warn("could not resolve channel for PR processing", "workspace", c.workspaceName, @@ -1206,24 +1204,17 @@ func (c *Coordinator) processPRForChannel( // Track that we notified users in this channel for DM delay logic c.notifier.Tracker.UpdateChannelNotification(workspaceID, owner, repo, prNumber) - // Track user tags in channel asynchronously to avoid blocking thread creation - // This is the critical performance optimization - email lookups take 13-20 seconds each - // Extract GitHub usernames who are blocked on this PR + // Track user tags in channel for DM delay logic blockedUsers := c.extractBlockedUsersFromTurnclient(checkResult) domain := c.configManager.Domain(owner) if len(blockedUsers) > 0 { // Record tags for blocked users synchronously to prevent race with DM sending - // This must complete BEFORE DM notifications check tag info - // Note: Most lookups hit cache and are instant; occasional cold lookups may delay slightly - // but this is necessary for correct DM delay logic lookupCtx, lookupCancel := context.WithTimeout(ctx, 5*time.Second) defer lookupCancel() for _, githubUser := range blockedUsers { - // Map GitHub username to Slack user ID slackUserID, err := c.userMapper.SlackHandle(lookupCtx, githubUser, owner, domain) if err == nil && slackUserID != "" { - // Track with channelID - this will only update on FIRST call per user/PR c.notifier.Tracker.UpdateUserPRChannelTag(workspaceID, slackUserID, channelID, owner, repo, prNumber) slog.Debug("tracked user tag in channel", "workspace", workspaceID, diff --git a/internal/bot/bot_sprinkler.go b/internal/bot/bot_sprinkler.go index fa8a754..6dbb869 100644 --- a/internal/bot/bot_sprinkler.go +++ b/internal/bot/bot_sprinkler.go @@ -49,6 +49,88 @@ func eventKey(event client.Event) string { return fmt.Sprintf("%s:%s:%s", event.Timestamp.Format(time.RFC3339Nano), event.URL, event.Type) } +// lookupPRsForCheckEvent looks up PRs for a check event that only has a repo URL. +// This happens when sprinkler's cache doesn't know about the PR yet (timing race). +func (c *Coordinator) lookupPRsForCheckEvent(ctx context.Context, event client.Event, organization string) []int { + commitSHA := "" + deliveryID := "" + if event.Raw != nil { + if sha, ok := event.Raw["commit_sha"].(string); ok { + commitSHA = sha + } + if id, ok := event.Raw["delivery_id"].(string); ok { + deliveryID = id + } + } + + if commitSHA == "" { + slog.Warn("check event without PR number or commit SHA - cannot look up PRs", + "organization", organization, + "url", event.URL, + "type", event.Type, + "delivery_id", deliveryID) + return nil + } + + // Extract owner/repo from URL + parts := strings.Split(event.URL, "/") + if len(parts) < 5 || parts[2] != "github.com" { + slog.Error("could not extract repo from check event URL", + "organization", organization, + "url", event.URL, + "error", "invalid URL format") + return nil + } + owner := parts[3] + repo := parts[4] + + slog.Info("sprinkler cache miss - looking up PRs for commit via GitHub API", + "organization", organization, + "owner", owner, + "repo", repo, + "commit_sha", commitSHA, + "type", event.Type, + "delivery_id", deliveryID) + + // Look up PRs for this commit using GitHub API + // Allow up to 3 minutes for retries (2 min max delay + buffer) + lookupCtx, cancel := context.WithTimeout(ctx, 3*time.Minute) + defer cancel() + + prNumbers, lookupErr := c.github.FindPRsForCommit(lookupCtx, owner, repo, commitSHA) + if lookupErr != nil { + slog.Error("failed to look up PRs for commit", + "organization", organization, + "owner", owner, + "repo", repo, + "commit_sha", commitSHA, + "error", lookupErr, + "impact", "will miss this check event", + "fallback", "5-minute polling will catch it") + return nil + } + + if len(prNumbers) == 0 { + slog.Debug("no open PRs found for commit - likely push to main", + "organization", organization, + "owner", owner, + "repo", repo, + "commit_sha", commitSHA, + "type", event.Type) + return nil + } + + slog.Info("found PRs for commit - processing check event", + "organization", organization, + "owner", owner, + "repo", repo, + "commit_sha", commitSHA, + "pr_count", len(prNumbers), + "pr_numbers", prNumbers) + + return prNumbers +} + // handleSprinklerEvent processes a single event from sprinkler. func (c *Coordinator) handleSprinklerEvent(ctx context.Context, event client.Event, organization string) { // Generate event key using delivery_id if available, otherwise timestamp + URL + type. @@ -130,10 +212,62 @@ func (c *Coordinator) handleSprinklerEvent(ctx context.Context, event client.Eve prNumber, err := parsePRNumberFromURL(event.URL) if err != nil { - slog.Error("failed to parse PR number from URL", + // Sprinkler sends events with just repo URLs when it can't find PRs for a commit + // For check events, try to recover by looking up PRs ourselves + if event.Type == "check_suite" || event.Type == "check_run" { + prNumbers := c.lookupPRsForCheckEvent(ctx, event, organization) + if prNumbers == nil { + return + } + + // Extract owner/repo from URL for constructing PR URLs + parts := strings.Split(event.URL, "/") + owner := parts[3] + repo := parts[4] + + // Process event for each PR found + for _, num := range prNumbers { + msg := SprinklerMessage{ + Type: event.Type, + Event: event.Type, + Repo: owner + "/" + repo, + PRNumber: num, + URL: fmt.Sprintf("https://github.com/%s/%s/pull/%d", owner, repo, num), + Timestamp: event.Timestamp, + } + + // Process with timeout + processCtx, processCancel := context.WithTimeout(ctx, 30*time.Second) + if err := c.processEvent(processCtx, msg); err != nil { + slog.Error("error processing recovered check event", + "organization", organization, + "error", err, + "type", event.Type, + "url", msg.URL, + "repo", msg.Repo, + "pr_number", num) + } + processCancel() + } + return + } + + // Non-check events without PR numbers - just log and ignore + var commitSHA, deliveryID string + if event.Raw != nil { + if sha, ok := event.Raw["commit_sha"].(string); ok { + commitSHA = sha + } + if id, ok := event.Raw["delivery_id"].(string); ok { + deliveryID = id + } + } + slog.Debug("ignoring event without PR number", "organization", organization, "url", event.URL, - "error", err) + "type", event.Type, + "commit_sha", commitSHA, + "delivery_id", deliveryID) return } diff --git a/internal/bot/polling_test.go b/internal/bot/polling_test.go index db8f2cf..dc93a0e 100644 --- a/internal/bot/polling_test.go +++ b/internal/bot/polling_test.go @@ -15,12 +15,12 @@ import ( // See: internal/bot/polling.go:98 - ListClosedPRs(ctx, org, 1) func TestClosedPRPollingWindow(t *testing.T) { tests := []struct { - name string - prClosedAt time.Time - pollingRunsAt time.Time - lookbackHours int - expectPRIncluded bool - scenario string + name string + prClosedAt time.Time + pollingRunsAt time.Time + lookbackHours int + expectPRIncluded bool + scenario string }{ { name: "PR closed 5 minutes ago - should be caught", @@ -210,6 +210,10 @@ func TestClosedPRRecoveryScenarios(t *testing.T) { // parseTime is a helper to create times on today's date for testing. func parseTime(hhMM string) time.Time { now := time.Now() - parsed, _ := time.Parse("15:04", hhMM) + parsed, err := time.Parse("15:04", hhMM) + if err != nil { + // This should never happen in tests with valid time strings + panic("parseTime: invalid time format " + hhMM + ": " + err.Error()) + } return time.Date(now.Year(), now.Month(), now.Day(), parsed.Hour(), parsed.Minute(), 0, 0, now.Location()) } diff --git a/internal/github/github.go b/internal/github/github.go index f827f9d..d0ec809 100644 --- a/internal/github/github.go +++ b/internal/github/github.go @@ -586,6 +586,80 @@ type PRInfo struct { Number int } +// FindPRsForCommit finds all open PRs associated with a commit SHA. +func (c *Client) FindPRsForCommit(ctx context.Context, owner, repo, sha string) ([]int, error) { + if owner == "" || repo == "" || sha == "" { + return nil, fmt.Errorf("invalid parameters: owner=%q, repo=%q, sha=%q", owner, repo, sha) + } + + slog.Debug("looking up PRs for commit", + "owner", owner, + "repo", repo, + "sha", sha) + + var allPRs []*github.PullRequest + err := retry.Do( + func() error { + var resp *github.Response + var err error + allPRs, resp, err = c.client.PullRequests.ListPullRequestsWithCommit( + ctx, + owner, + repo, + sha, + &github.PullRequestListOptions{ + State: "all", // Include open, closed, and merged + }, + ) + if err != nil { + if resp != nil && resp.StatusCode == http.StatusNotFound { + // Commit doesn't exist or no PRs found - don't retry + return retry.Unrecoverable(err) + } + slog.Warn("failed to list PRs for commit, retrying", + "owner", owner, + "repo", repo, + "sha", sha, + "error", err) + return err + } + return nil + }, + retry.Attempts(5), + retry.Delay(time.Second), + retry.MaxDelay(2*time.Minute), + retry.DelayType(retry.BackOffDelay), + retry.MaxJitter(time.Second), + retry.LastErrorOnly(true), + retry.Context(ctx), + ) + if err != nil { + slog.Debug("no PRs found for commit", + "owner", owner, + "repo", repo, + "sha", sha, + "error", err) + return []int{}, nil // Return empty list, not error + } + + // Extract PR numbers from open PRs only + var prNumbers []int + for _, pr := range allPRs { + if pr.GetState() == "open" { + prNumbers = append(prNumbers, pr.GetNumber()) + } + } + + slog.Debug("found PRs for commit", + "owner", owner, + "repo", repo, + "sha", sha, + "pr_count", len(prNumbers), + "pr_numbers", prNumbers) + + return prNumbers, nil +} + // Client returns the underlying GitHub client. func (c *Client) Client() *github.Client { return c.client