From ae6dd8677778c7efd630872cf809cf750a4babf4 Mon Sep 17 00:00:00 2001
From: "seer-by-sentry[bot]" <157164994+seer-by-sentry[bot]@users.noreply.github.com>
Date: Tue, 11 Nov 2025 18:18:37 +0000
Subject: [PATCH 1/4] fix: Batch commit file change queries to avoid timeouts

---
 src/sentry/utils/committers.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/sentry/utils/committers.py b/src/sentry/utils/committers.py
index afdb025ba46dc6..f9bbe01af2373c 100644
--- a/src/sentry/utils/committers.py
+++ b/src/sentry/utils/committers.py
@@ -26,6 +26,9 @@
 from sentry.utils.hashlib import hash_values
 
 PATH_SEPARATORS = frozenset(["/", "\\"])
+# Limit the number of commits to batch in a single query to avoid query timeouts
+# from large IN clauses combined with complex LIKE conditions
+COMMIT_BATCH_SIZE = 50
 
 
 def tokenize_path(path: str) -> Iterator[str]:
@@ -96,11 +99,19 @@ def _get_commit_file_changes(
     # build a single query to get all of the commit file that might match the first n frames
     path_query = reduce(operator.or_, (Q(filename__iendswith=path) for path in filenames))
 
-    commit_file_change_matches = CommitFileChange.objects.filter(
-        path_query, commit_id__in=[c.id for c in commits]
-    )
+    # Batch commits to avoid query timeouts from large IN clauses
+    # combined with complex LIKE conditions
+    all_file_changes: list[CommitFileChange] = []
+    commit_ids = [c.id for c in commits]
+
+    for i in range(0, len(commit_ids), COMMIT_BATCH_SIZE):
+        batch_commit_ids = commit_ids[i:i + COMMIT_BATCH_SIZE]
+        commit_file_change_matches = CommitFileChange.objects.filter(
+            path_query, commit_id__in=batch_commit_ids
+        )
+        all_file_changes.extend(list(commit_file_change_matches))
 
-    return list(commit_file_change_matches)
+    return all_file_changes
 
 
 def _match_commits_paths(

From 8b3d8dc951db8b5e012e1cef4bf892aee383dfd8 Mon Sep 17 00:00:00 2001
From: "getsantry[bot]" <66042841+getsantry[bot]@users.noreply.github.com>
Date: Tue, 11 Nov 2025 18:19:39 +0000
Subject: [PATCH 2/4] :hammer_and_wrench: apply pre-commit fixes

---
 src/sentry/utils/committers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sentry/utils/committers.py b/src/sentry/utils/committers.py
index f9bbe01af2373c..c6ca2ad40c9dd4 100644
--- a/src/sentry/utils/committers.py
+++ b/src/sentry/utils/committers.py
@@ -103,9 +103,9 @@ def _get_commit_file_changes(
     # combined with complex LIKE conditions
     all_file_changes: list[CommitFileChange] = []
     commit_ids = [c.id for c in commits]
-    
+
     for i in range(0, len(commit_ids), COMMIT_BATCH_SIZE):
-        batch_commit_ids = commit_ids[i:i + COMMIT_BATCH_SIZE]
+        batch_commit_ids = commit_ids[i : i + COMMIT_BATCH_SIZE]
         commit_file_change_matches = CommitFileChange.objects.filter(
             path_query, commit_id__in=batch_commit_ids
         )

From a7d17ba16f52da29135c00deba78cf9704b08121 Mon Sep 17 00:00:00 2001
From: Nora Shapiro
Date: Tue, 11 Nov 2025 15:40:06 -0800
Subject: [PATCH 3/4] Optimize CommitFileChange query to prevent timeouts

---
 src/sentry/utils/committers.py | 47 +++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/src/sentry/utils/committers.py b/src/sentry/utils/committers.py
index c6ca2ad40c9dd4..cd217de588bb0e 100644
--- a/src/sentry/utils/committers.py
+++ b/src/sentry/utils/committers.py
@@ -4,11 +4,9 @@
 from collections import defaultdict
 from collections.abc import Iterator, Mapping, MutableMapping, Sequence
 from enum import Enum
-from functools import reduce
 from typing import Any, TypedDict
 
 from django.core.cache import cache
-from django.db.models import Q
 
 from sentry.api.serializers import serialize
 from sentry.api.serializers.models.commit import CommitSerializer, get_users_for_commits
@@ -24,6 +22,7 @@ from sentry.users.services.user.service import user_service
 from sentry.utils.event_frames import find_stack_frames, munged_filename_and_frames
 from sentry.utils.hashlib import hash_values
+from sentry.utils.iterators import chunked
 
 PATH_SEPARATORS = frozenset(["/", "\\"])
 # Limit the number of commits to batch in a single query to avoid query timeouts
 # from large IN clauses combined with complex LIKE conditions
@@ -90,28 +89,40 @@ def _get_commits(releases: Sequence[Release]) -> Sequence[Commit]:
 def _get_commit_file_changes(
     commits: Sequence[Commit], path_name_set: set[str]
 ) -> Sequence[CommitFileChange]:
-    # Get distinct file names and bail if there are no files.
-    filenames = {next(tokenize_path(path), None) for path in path_name_set}
-    filenames = {path for path in filenames if path is not None}
+    """
+    Find CommitFileChanges matching file paths in path_name_set.
+    Batches queries and deduplicates results across multiple filename matches.
+    """
+    filenames: set[str] = {
+        path for path in (next(tokenize_path(p), None) for p in path_name_set) if path is not None
+    }
     if not len(filenames):
         return []
+    if not commits:
+        return []
 
-    # build a single query to get all of the commit file that might match the first n frames
-    path_query = reduce(operator.or_, (Q(filename__iendswith=path) for path in filenames))
-
-    # Batch commits to avoid query timeouts from large IN clauses
-    # combined with complex LIKE conditions
-    all_file_changes: list[CommitFileChange] = []
     commit_ids = [c.id for c in commits]
 
-    for i in range(0, len(commit_ids), COMMIT_BATCH_SIZE):
-        batch_commit_ids = commit_ids[i : i + COMMIT_BATCH_SIZE]
-        commit_file_change_matches = CommitFileChange.objects.filter(
-            path_query, commit_id__in=batch_commit_ids
-        )
-        all_file_changes.extend(list(commit_file_change_matches))
+    # Collect unique CommitFileChange IDs
+    matching_ids: set[int] = set()
+
+    # Optimization 1: Batch commit IDs with chunked() to prevent huge IN clauses
+    for commit_batch in chunked(commit_ids, 100):
+        # Optimization 2: Split filename queries to eliminate OR conditions
+        for filename in filenames:
+            # Optimization 3 (Experimental): separate filter calls to hint optimizer to use indexes first
+            matches = CommitFileChange.objects.filter(commit_id__in=commit_batch).filter(
+                filename__iendswith=filename
+            )
+
+            # Collect IDs and deduplicate with set operations
+            matching_ids.update(matches.values_list("id", flat=True))
+
+    if not matching_ids:
+        return []
 
-    return all_file_changes
+    # Single bulk fetch of unique results, ordered by ID
+    return list(CommitFileChange.objects.filter(id__in=matching_ids).order_by("id"))
 
 
 def _match_commits_paths(

From 0081761e81db71adc082765d4f9462d0271be890 Mon Sep 17 00:00:00 2001
From: Nora Shapiro
Date: Tue, 11 Nov 2025 15:55:11 -0800
Subject: [PATCH 4/4] no magic numbers

---
 src/sentry/utils/committers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sentry/utils/committers.py b/src/sentry/utils/committers.py
index cd217de588bb0e..1ff7d5347dcada 100644
--- a/src/sentry/utils/committers.py
+++ b/src/sentry/utils/committers.py
@@ -27,7 +27,7 @@
 PATH_SEPARATORS = frozenset(["/", "\\"])
 # Limit the number of commits to batch in a single query to avoid query timeouts
 # from large IN clauses combined with complex LIKE conditions
-COMMIT_BATCH_SIZE = 50
+COMMIT_BATCH_SIZE = 100
 
 
 def tokenize_path(path: str) -> Iterator[str]:
@@ -107,7 +107,7 @@ def _get_commit_file_changes(
     matching_ids: set[int] = set()
 
     # Optimization 1: Batch commit IDs with chunked() to prevent huge IN clauses
-    for commit_batch in chunked(commit_ids, 100):
+    for commit_batch in chunked(commit_ids, COMMIT_BATCH_SIZE):
         # Optimization 2: Split filename queries to eliminate OR conditions
         for filename in filenames:
             # Optimization 3 (Experimental): separate filter calls to hint optimizer to use indexes first
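
For review convenience, below is a sketch of how _get_commit_file_changes should read once all four
patches apply, reassembled from the hunks above; the patches themselves remain authoritative. The
surrounding module is assumed to already provide CommitFileChange, Commit, Sequence, tokenize_path,
and chunked, as the import hunks show.

# Limit the number of commits to batch in a single query to avoid query timeouts
# from large IN clauses combined with complex LIKE conditions
COMMIT_BATCH_SIZE = 100


def _get_commit_file_changes(
    commits: Sequence[Commit], path_name_set: set[str]
) -> Sequence[CommitFileChange]:
    """
    Find CommitFileChanges matching file paths in path_name_set.
    Batches queries and deduplicates results across multiple filename matches.
    """
    filenames: set[str] = {
        path for path in (next(tokenize_path(p), None) for p in path_name_set) if path is not None
    }
    if not len(filenames):
        return []
    if not commits:
        return []

    commit_ids = [c.id for c in commits]

    # Collect unique CommitFileChange IDs
    matching_ids: set[int] = set()

    # One query per (commit batch, filename) pair: batching keeps each IN clause small,
    # and splitting filenames avoids OR-ed LIKE conditions in a single statement.
    for commit_batch in chunked(commit_ids, COMMIT_BATCH_SIZE):
        for filename in filenames:
            matches = CommitFileChange.objects.filter(commit_id__in=commit_batch).filter(
                filename__iendswith=filename
            )
            # Deduplicate across overlapping filename matches via a set of primary keys
            matching_ids.update(matches.values_list("id", flat=True))

    if not matching_ids:
        return []

    # Single bulk fetch of unique results, ordered by ID
    return list(CommitFileChange.objects.filter(id__in=matching_ids).order_by("id"))

The trade-off, per the commit messages and inline comments, is more but smaller queries (one per
commit batch and filename, plus one bulk fetch by ID), each intended to stay clear of the statement
timeout that the original single OR/IN query could hit.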