75 changes: 37 additions & 38 deletions src/sentry/deletions/defaults/group.py
@@ -32,7 +32,6 @@
from sentry.snuba.dataset import Dataset
from sentry.tasks.delete_seer_grouping_records import may_schedule_task_to_delete_hashes_from_seer
from sentry.utils import metrics
from sentry.utils.query import RangeQuerySetWrapper

from ..base import BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
from ..manager import DeletionTaskManager
@@ -228,8 +227,12 @@ def _delete_children(self, instance_list: Sequence[Group]) -> None:
# delete_children() will delete GroupHash rows and related GroupHashMetadata rows,
# however, we have added multiple optimizations in this function that would need to
# be ported to a custom deletion task.
delete_group_hashes(instance_list[0].project_id, error_group_ids, seer_deletion=True)
delete_group_hashes(instance_list[0].project_id, issue_platform_group_ids)
delete_project_group_hashes(
instance_list[0].project_id, group_ids_filter=error_group_ids, seer_deletion=True
)
delete_project_group_hashes(
instance_list[0].project_id, group_ids_filter=issue_platform_group_ids
)

# If this isn't a retention cleanup also remove event data.
if not os.environ.get("_SENTRY_CLEANUP"):
@@ -259,21 +262,6 @@ def mark_deletion_in_progress(self, instance_list: Sequence[Group]) -> None:
).update(status=GroupStatus.DELETION_IN_PROGRESS, substatus=None)


def delete_project_group_hashes(project_id: int) -> None:
groups = []
for group in RangeQuerySetWrapper(
Group.objects.filter(project_id=project_id), step=GROUP_CHUNK_SIZE
):
groups.append(group)

This was effectively restricting the deletions to rows with a group set, missing all rows not associated with any group.

All GroupHash rows for this project have a null group_id:

SELECT count(*) FROM `getsentry.sentry_grouphash` WHERE project_id IN (4506269346365440) and group_id is null;
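
To illustrate the reviewer's point, here is a minimal sketch (not part of the diff) of why a group-based iteration can never reach orphaned hash rows; the model import paths are assumed, and the project ID is the hypothetical value from the query above:

from sentry.models.group import Group
from sentry.models.grouphash import GroupHash
from sentry.utils.query import RangeQuerySetWrapper

project_id = 4506269346365440  # hypothetical, taken from the SQL above

# Old flow: iterate over this project's Group rows, then delete hashes by group ID.
group_ids = [
    group.id
    for group in RangeQuerySetWrapper(Group.objects.filter(project_id=project_id))
]

# group_id__in=[...] compiles to SQL "group_id IN (...)", which is never true when
# group_id is NULL, so orphaned GroupHash rows are never selected by this query.
old_scope = GroupHash.objects.filter(project_id=project_id, group_id__in=group_ids)

# New flow: filtering by project alone also matches rows where group_id IS NULL.
new_scope = GroupHash.objects.filter(project_id=project_id)
orphans = new_scope.filter(group__isnull=True)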


error_groups, issue_platform_groups = separate_by_group_category(groups)
error_group_ids = [group.id for group in error_groups]
delete_group_hashes(project_id, error_group_ids, seer_deletion=True)

issue_platform_group_ids = [group.id for group in issue_platform_groups]
delete_group_hashes(project_id, issue_platform_group_ids)


def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
"""
Update seer_matched_grouphash to None for GroupHashMetadata rows
@@ -323,41 +311,52 @@ def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
metrics.incr("deletions.group_hash_metadata.max_iterations_reached", sample_rate=1.0)


def delete_group_hashes(
def delete_project_group_hashes(
project_id: int,
group_ids: Sequence[int],
group_ids_filter: Sequence[int] | None = None,
seer_deletion: bool = False,
) -> None:
# Validate batch size to ensure it's at least 1 to avoid ValueError in range()
"""
Delete GroupHash records for a project.

This is the main function for deleting GroupHash records. It can delete all hashes for a project
(used during project deletion to clean up orphaned records) or delete hashes for specific groups
(used during group deletion).

Args:
project_id: The project to delete hashes for
group_ids_filter: Optional filter for specific group IDs.
- If None: deletes ALL GroupHash records for the project (including orphans)
- If empty: returns immediately (no-op for safety)
- If non-empty: deletes only hashes for those specific groups
seer_deletion: Whether to notify Seer about the deletion
"""
# Safety: empty filter means nothing to delete
if group_ids_filter is not None and len(group_ids_filter) == 0:
return

hashes_batch_size = max(1, options.get("deletions.group-hashes-batch-size"))

# Set a reasonable upper bound on iterations to prevent infinite loops.
# The loop will naturally terminate when no more hashes are found.
iterations = 0
while iterations < GROUP_HASH_ITERATIONS:
qs = GroupHash.objects.filter(project_id=project_id, group_id__in=group_ids).values_list(
"id", "hash"
)[:hashes_batch_size]
hashes_chunk = list(qs)
# Base query: all hashes for this project
qs = GroupHash.objects.filter(project_id=project_id)

This allows querying for rows that do not have a group set.


# Apply group filter if provided
if group_ids_filter is not None:
qs = qs.filter(group_id__in=group_ids_filter)

hashes_chunk = list(qs.values_list("id", "hash")[:hashes_batch_size])
if not hashes_chunk:
break
try:
if seer_deletion:
# Tell seer to delete grouping records for these groups
# It's low priority to delete the hashes from seer, so we don't want
# any network errors to block the deletion of the groups
hash_values = [gh[1] for gh in hashes_chunk]
may_schedule_task_to_delete_hashes_from_seer(project_id, hash_values)
except Exception:
logger.warning("Error scheduling task to delete hashes from seer")
finally:
hash_ids = [gh[0] for gh in hashes_chunk]
# GroupHashMetadata rows can reference GroupHash rows via seer_matched_grouphash_id.
# Before deleting these GroupHash rows, we need to either:
# 1. Update seer_matched_grouphash to None first (to avoid foreign key constraint errors), OR
# 2. Delete the GroupHashMetadata rows entirely (they'll be deleted anyway)
# If we update the columns first, the deletion of the grouphash metadata rows will have less work to do,
# thus improving the performance of the deletion.
update_group_hash_metadata_in_batches(hash_ids)
GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
GroupHash.objects.filter(id__in=hash_ids).delete()
@@ -367,8 +366,8 @@ def delete_group_hashes(
if iterations == GROUP_HASH_ITERATIONS:
metrics.incr("deletions.group_hashes.max_iterations_reached", sample_rate=1.0)
logger.warning(
"Group hashes batch deletion reached the maximum number of iterations. "
"Investigate if we need to change the GROUP_HASH_ITERATIONS value."
"delete_group_hashes.max_iterations_reached",
extra={"project_id": project_id, "filtered": group_ids_filter is not None},
)
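
For reference, a minimal usage sketch of the reworked helper under the contract described in its docstring; the project and group IDs below are hypothetical, and this is not part of the diff:

from sentry.deletions.defaults.group import delete_project_group_hashes

project_id = 42  # hypothetical

# Specific groups only (group deletion path); also notifies Seer about the hashes.
delete_project_group_hashes(project_id, group_ids_filter=[101, 102], seer_deletion=True)

# Empty filter is a deliberate no-op for safety: nothing is deleted.
delete_project_group_hashes(project_id, group_ids_filter=[])

# No filter: delete every GroupHash row for the project, including orphans whose
# group_id is NULL (project deletion / cleanup path).
delete_project_group_hashes(project_id, group_ids_filter=None)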


67 changes: 66 additions & 1 deletion tests/sentry/deletions/test_group.py
@@ -9,7 +9,10 @@
from snuba_sdk import Column, Condition, DeleteQuery, Entity, Function, Op, Query, Request

from sentry import deletions, nodestore
from sentry.deletions.defaults.group import update_group_hash_metadata_in_batches
from sentry.deletions.defaults.group import (
delete_project_group_hashes,
update_group_hash_metadata_in_batches,
)
from sentry.deletions.tasks.groups import delete_groups_for_project
from sentry.issues.grouptype import FeedbackGroup, GroupCategory
from sentry.issues.issue_occurrence import IssueOccurrence
@@ -389,6 +392,68 @@ def test_update_group_hash_metadata_in_batches(self) -> None:
else:
raise AssertionError("GroupHashMetadata is None for grouphash id=%s" % grouphash.id)

def test_delete_project_group_hashes_specific_groups(self) -> None:
"""Test deleting grouphashes for specific group IDs (including metadata) and empty list safety."""
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

event_1 = self.store_event(
data={"platform": "python", "stacktrace": {"frames": [{"filename": "error_1.py"}]}},
project_id=self.project.id,
)
event_2 = self.store_event(
data={"platform": "python", "stacktrace": {"frames": [{"filename": "error_2.py"}]}},
project_id=self.project.id,
)

grouphash_1 = GroupHash.objects.get(group=event_1.group)
grouphash_2 = GroupHash.objects.get(group=event_2.group)
assert grouphash_1.metadata is not None
assert grouphash_2.metadata is not None
metadata_1_id = grouphash_1.metadata.id
metadata_2_id = grouphash_2.metadata.id

assert GroupHash.objects.filter(project=self.project).count() == 2

delete_project_group_hashes(
project_id=self.project.id,
group_ids_filter=[event_1.group.id],
)

assert not GroupHash.objects.filter(id=grouphash_1.id).exists()
assert not GroupHashMetadata.objects.filter(id=metadata_1_id).exists()
assert GroupHash.objects.filter(id=grouphash_2.id).exists()
assert GroupHashMetadata.objects.filter(id=metadata_2_id).exists()

# Empty list should be a no-op
delete_project_group_hashes(project_id=self.project.id, group_ids_filter=[])
assert GroupHash.objects.filter(id=grouphash_2.id).exists()

def test_delete_project_group_hashes_all_including_orphans(self) -> None:
"""Test deleting all grouphashes including orphans when group_ids_filter=None."""
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

event = self.store_event(
data={"platform": "python", "stacktrace": {"frames": [{"filename": "error.py"}]}},
project_id=self.project.id,
)
grouphash = GroupHash.objects.get(group=event.group)
assert grouphash.metadata is not None
metadata_id = grouphash.metadata.id

orphan_1 = GroupHash.objects.create(project=self.project, hash="a" * 32, group=None)
orphan_2 = GroupHash.objects.create(project=self.project, hash="b" * 32, group=None)

assert GroupHash.objects.filter(project=self.project).count() == 3
assert GroupHash.objects.filter(project=self.project, group__isnull=True).count() == 2

delete_project_group_hashes(project_id=self.project.id, group_ids_filter=None)

assert not GroupHash.objects.filter(id=grouphash.id).exists()
assert not GroupHash.objects.filter(id=orphan_1.id).exists()
assert not GroupHash.objects.filter(id=orphan_2.id).exists()
assert not GroupHashMetadata.objects.filter(id=metadata_id).exists()
assert GroupHash.objects.filter(project=self.project).count() == 0


class DeleteIssuePlatformTest(TestCase, SnubaTestCase, OccurrenceTestMixin):
referrer = Referrer.TESTING_TEST.value