Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 33 additions & 11 deletions src/sentry/utils/committers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
from collections import defaultdict
from collections.abc import Iterator, Mapping, MutableMapping, Sequence
from enum import Enum
from functools import reduce
from typing import Any, TypedDict

from django.core.cache import cache
from django.db.models import Q

from sentry.api.serializers import serialize
from sentry.api.serializers.models.commit import CommitSerializer, get_users_for_commits
Expand All @@ -24,8 +22,12 @@
from sentry.users.services.user.service import user_service
from sentry.utils.event_frames import find_stack_frames, munged_filename_and_frames
from sentry.utils.hashlib import hash_values
from sentry.utils.iterators import chunked

PATH_SEPARATORS = frozenset(["/", "\\"])
# Limit the number of commits to batch in a single query to avoid query timeouts
# from large IN clauses combined with complex LIKE conditions
COMMIT_BATCH_SIZE = 100


def tokenize_path(path: str) -> Iterator[str]:
Expand Down Expand Up @@ -87,20 +89,40 @@ def _get_commits(releases: Sequence[Release]) -> Sequence[Commit]:
def _get_commit_file_changes(
    commits: Sequence[Commit], path_name_set: set[str]
) -> Sequence[CommitFileChange]:
    """
    Find CommitFileChanges matching file paths in ``path_name_set``.

    Queries are batched by commit id (``COMMIT_BATCH_SIZE`` at a time) and
    issued once per filename to avoid query timeouts caused by large ``IN``
    clauses combined with OR'd ``LIKE`` conditions. Results are deduplicated
    across overlapping filename matches and returned in id order.

    :param commits: commits whose file changes should be searched.
    :param path_name_set: raw file paths; only the first token of each path
        (as produced by ``tokenize_path``) is matched against filenames.
    :returns: matching ``CommitFileChange`` rows, ordered by id; empty list
        when there are no usable filenames or no commits.
    """
    # Distinct first path tokens; drop paths that tokenize to nothing.
    filenames: set[str] = {
        path for path in (next(tokenize_path(p), None) for p in path_name_set) if path is not None
    }
    # Nothing to match against — bail before touching the database.
    if not filenames or not commits:
        return []

    commit_ids = [c.id for c in commits]

    # Collect unique CommitFileChange ids across all batch/filename queries.
    matching_ids: set[int] = set()

    # Batch commit ids to keep each IN clause small.
    for commit_batch in chunked(commit_ids, COMMIT_BATCH_SIZE):
        # One query per filename eliminates OR'd LIKE conditions; the
        # separate filter() calls hint the optimizer to use the commit_id
        # index before applying the LIKE.
        for filename in filenames:
            matches = CommitFileChange.objects.filter(commit_id__in=commit_batch).filter(
                filename__iendswith=filename
            )
            matching_ids.update(matches.values_list("id", flat=True))

    if not matching_ids:
        return []

    # Single bulk fetch of the deduplicated results, ordered by id.
    return list(CommitFileChange.objects.filter(id__in=matching_ids).order_by("id"))


def _match_commits_paths(
Expand Down
Loading