diff --git a/src/sentry/deletions/defaults/group.py b/src/sentry/deletions/defaults/group.py
index a815040177011b..0d21f1fd1e9bc7 100644
--- a/src/sentry/deletions/defaults/group.py
+++ b/src/sentry/deletions/defaults/group.py
@@ -32,7 +32,6 @@
 from sentry.snuba.dataset import Dataset
 from sentry.tasks.delete_seer_grouping_records import may_schedule_task_to_delete_hashes_from_seer
 from sentry.utils import metrics
-from sentry.utils.query import RangeQuerySetWrapper
 
 from ..base import BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
 from ..manager import DeletionTaskManager
@@ -228,8 +227,12 @@ def _delete_children(self, instance_list: Sequence[Group]) -> None:
         # delete_children() will delete GroupHash rows and related GroupHashMetadata rows,
         # however, we have added multiple optimizations in this function that would need to
         # be ported to a custom deletion task.
-        delete_group_hashes(instance_list[0].project_id, error_group_ids, seer_deletion=True)
-        delete_group_hashes(instance_list[0].project_id, issue_platform_group_ids)
+        delete_project_group_hashes(
+            instance_list[0].project_id, group_ids_filter=error_group_ids, seer_deletion=True
+        )
+        delete_project_group_hashes(
+            instance_list[0].project_id, group_ids_filter=issue_platform_group_ids
+        )
 
         # If this isn't a retention cleanup also remove event data.
         if not os.environ.get("_SENTRY_CLEANUP"):
@@ -259,21 +262,6 @@ def mark_deletion_in_progress(self, instance_list: Sequence[Group]) -> None:
         ).update(status=GroupStatus.DELETION_IN_PROGRESS, substatus=None)
 
 
-def delete_project_group_hashes(project_id: int) -> None:
-    groups = []
-    for group in RangeQuerySetWrapper(
-        Group.objects.filter(project_id=project_id), step=GROUP_CHUNK_SIZE
-    ):
-        groups.append(group)
-
-    error_groups, issue_platform_groups = separate_by_group_category(groups)
-    error_group_ids = [group.id for group in error_groups]
-    delete_group_hashes(project_id, error_group_ids, seer_deletion=True)
-
-    issue_platform_group_ids = [group.id for group in issue_platform_groups]
-    delete_group_hashes(project_id, issue_platform_group_ids)
-
-
 def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
     """
     Update seer_matched_grouphash to None for GroupHashMetadata rows
@@ -323,41 +311,52 @@ def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
         metrics.incr("deletions.group_hash_metadata.max_iterations_reached", sample_rate=1.0)
 
 
-def delete_group_hashes(
+def delete_project_group_hashes(
     project_id: int,
-    group_ids: Sequence[int],
+    group_ids_filter: Sequence[int] | None = None,
     seer_deletion: bool = False,
 ) -> None:
-    # Validate batch size to ensure it's at least 1 to avoid ValueError in range()
+    """
+    Delete GroupHash records for a project.
+
+    This is the main function for deleting GroupHash records. It can delete all hashes for a project
+    (used during project deletion to clean up orphaned records) or delete hashes for specific groups
+    (used during group deletion).
+
+    Args:
+        project_id: The project to delete hashes for
+        group_ids_filter: Optional filter for specific group IDs.
+            - If None: deletes ALL GroupHash records for the project (including orphans)
+            - If empty: returns immediately (no-op for safety)
+            - If non-empty: deletes only hashes for those specific groups
+        seer_deletion: Whether to notify Seer about the deletion
+    """
+    # Safety: empty filter means nothing to delete
+    if group_ids_filter is not None and len(group_ids_filter) == 0:
+        return
+
     hashes_batch_size = max(1, options.get("deletions.group-hashes-batch-size"))
 
-    # Set a reasonable upper bound on iterations to prevent infinite loops.
-    # The loop will naturally terminate when no more hashes are found.
     iterations = 0
     while iterations < GROUP_HASH_ITERATIONS:
-        qs = GroupHash.objects.filter(project_id=project_id, group_id__in=group_ids).values_list(
-            "id", "hash"
-        )[:hashes_batch_size]
-        hashes_chunk = list(qs)
+        # Base query: all hashes for this project
+        qs = GroupHash.objects.filter(project_id=project_id)
+
+        # Apply group filter if provided
+        if group_ids_filter is not None:
+            qs = qs.filter(group_id__in=group_ids_filter)
+
+        hashes_chunk = list(qs.values_list("id", "hash")[:hashes_batch_size])
         if not hashes_chunk:
             break
         try:
             if seer_deletion:
-                # Tell seer to delete grouping records for these groups
-                # It's low priority to delete the hashes from seer, so we don't want
-                # any network errors to block the deletion of the groups
                 hash_values = [gh[1] for gh in hashes_chunk]
                 may_schedule_task_to_delete_hashes_from_seer(project_id, hash_values)
         except Exception:
             logger.warning("Error scheduling task to delete hashes from seer")
         finally:
             hash_ids = [gh[0] for gh in hashes_chunk]
-            # GroupHashMetadata rows can reference GroupHash rows via seer_matched_grouphash_id.
-            # Before deleting these GroupHash rows, we need to either:
-            # 1. Update seer_matched_grouphash to None first (to avoid foreign key constraint errors), OR
-            # 2. Delete the GroupHashMetadata rows entirely (they'll be deleted anyway)
-            # If we update the columns first, the deletion of the grouphash metadata rows will have less work to do,
-            # thus, improving the performance of the deletion.
             update_group_hash_metadata_in_batches(hash_ids)
             GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
             GroupHash.objects.filter(id__in=hash_ids).delete()
@@ -367,8 +366,8 @@ def delete_group_hashes(
     if iterations == GROUP_HASH_ITERATIONS:
         metrics.incr("deletions.group_hashes.max_iterations_reached", sample_rate=1.0)
         logger.warning(
-            "Group hashes batch deletion reached the maximum number of iterations. "
-            "Investigate if we need to change the GROUP_HASH_ITERATIONS value."
+            "delete_group_hashes.max_iterations_reached",
+            extra={"project_id": project_id, "filtered": group_ids_filter is not None},
         )
 
 
diff --git a/tests/sentry/deletions/test_group.py b/tests/sentry/deletions/test_group.py
index 530fcacf0ae0f7..5cf32fb398a4da 100644
--- a/tests/sentry/deletions/test_group.py
+++ b/tests/sentry/deletions/test_group.py
@@ -9,7 +9,10 @@
 from snuba_sdk import Column, Condition, DeleteQuery, Entity, Function, Op, Query, Request
 
 from sentry import deletions, nodestore
-from sentry.deletions.defaults.group import update_group_hash_metadata_in_batches
+from sentry.deletions.defaults.group import (
+    delete_project_group_hashes,
+    update_group_hash_metadata_in_batches,
+)
 from sentry.deletions.tasks.groups import delete_groups_for_project
 from sentry.issues.grouptype import FeedbackGroup, GroupCategory
 from sentry.issues.issue_occurrence import IssueOccurrence
@@ -389,6 +392,68 @@ def test_update_group_hash_metadata_in_batches(self) -> None:
             else:
                 raise AssertionError("GroupHashMetadata is None for grouphash id=%s" % grouphash.id)
 
+    def test_delete_project_group_hashes_specific_groups(self) -> None:
+        """Test deleting grouphashes for specific group IDs (including metadata) and empty list safety."""
+        self.project.update_option("sentry:similarity_backfill_completed", int(time()))
+
+        event_1 = self.store_event(
+            data={"platform": "python", "stacktrace": {"frames": [{"filename": "error_1.py"}]}},
+            project_id=self.project.id,
+        )
+        event_2 = self.store_event(
+            data={"platform": "python", "stacktrace": {"frames": [{"filename": "error_2.py"}]}},
+            project_id=self.project.id,
+        )
+
+        grouphash_1 = GroupHash.objects.get(group=event_1.group)
+        grouphash_2 = GroupHash.objects.get(group=event_2.group)
+        assert grouphash_1.metadata is not None
+        assert grouphash_2.metadata is not None
+        metadata_1_id = grouphash_1.metadata.id
+        metadata_2_id = grouphash_2.metadata.id
+
+        assert GroupHash.objects.filter(project=self.project).count() == 2
+
+        delete_project_group_hashes(
+            project_id=self.project.id,
+            group_ids_filter=[event_1.group.id],
+        )
+
+        assert not GroupHash.objects.filter(id=grouphash_1.id).exists()
+        assert not GroupHashMetadata.objects.filter(id=metadata_1_id).exists()
+        assert GroupHash.objects.filter(id=grouphash_2.id).exists()
+        assert GroupHashMetadata.objects.filter(id=metadata_2_id).exists()
+
+        # Empty list should be a no-op
+        delete_project_group_hashes(project_id=self.project.id, group_ids_filter=[])
+        assert GroupHash.objects.filter(id=grouphash_2.id).exists()
+
+    def test_delete_project_group_hashes_all_including_orphans(self) -> None:
+        """Test deleting all grouphashes including orphans when group_ids_filter=None."""
+        self.project.update_option("sentry:similarity_backfill_completed", int(time()))
+
+        event = self.store_event(
+            data={"platform": "python", "stacktrace": {"frames": [{"filename": "error.py"}]}},
+            project_id=self.project.id,
+        )
+        grouphash = GroupHash.objects.get(group=event.group)
+        assert grouphash.metadata is not None
+        metadata_id = grouphash.metadata.id
+
+        orphan_1 = GroupHash.objects.create(project=self.project, hash="a" * 32, group=None)
+        orphan_2 = GroupHash.objects.create(project=self.project, hash="b" * 32, group=None)
+
+        assert GroupHash.objects.filter(project=self.project).count() == 3
+        assert GroupHash.objects.filter(project=self.project, group__isnull=True).count() == 2
+
+        delete_project_group_hashes(project_id=self.project.id, group_ids_filter=None)
+
+        assert not GroupHash.objects.filter(id=grouphash.id).exists()
+        assert not GroupHash.objects.filter(id=orphan_1.id).exists()
+        assert not GroupHash.objects.filter(id=orphan_2.id).exists()
+        assert not GroupHashMetadata.objects.filter(id=metadata_id).exists()
+        assert GroupHash.objects.filter(project=self.project).count() == 0
+
 
 class DeleteIssuePlatformTest(TestCase, SnubaTestCase, OccurrenceTestMixin):
     referrer = Referrer.TESTING_TEST.value
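
A minimal usage sketch of the consolidated delete_project_group_hashes API introduced in this diff; the project and group IDs below are hypothetical, and only the keyword arguments shown above are used:

from sentry.deletions.defaults.group import delete_project_group_hashes

project_id = 42              # hypothetical project ID, for illustration only
error_group_ids = [1, 2, 3]  # hypothetical group IDs

# Group deletion path: delete hashes for specific groups and notify Seer.
delete_project_group_hashes(project_id, group_ids_filter=error_group_ids, seer_deletion=True)

# Project deletion path: delete every GroupHash for the project, including orphaned rows.
delete_project_group_hashes(project_id, group_ids_filter=None)

# Safety no-op: an empty filter returns immediately without deleting anything.
delete_project_group_hashes(project_id, group_ids_filter=[])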