75 changes: 37 additions & 38 deletions src/sentry/deletions/defaults/group.py
@@ -32,7 +32,6 @@
from sentry.snuba.dataset import Dataset
from sentry.tasks.delete_seer_grouping_records import may_schedule_task_to_delete_hashes_from_seer
from sentry.utils import metrics
from sentry.utils.query import RangeQuerySetWrapper

from ..base import BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
from ..manager import DeletionTaskManager
@@ -228,8 +227,12 @@ def _delete_children(self, instance_list: Sequence[Group]) -> None:
# delete_children() will delete GroupHash rows and related GroupHashMetadata rows,
# however, we have added multiple optimizations in this function that would need to
# be ported to a custom deletion task.
delete_group_hashes(instance_list[0].project_id, error_group_ids, seer_deletion=True)
delete_group_hashes(instance_list[0].project_id, issue_platform_group_ids)
delete_project_group_hashes(
instance_list[0].project_id, group_ids_filter=error_group_ids, seer_deletion=True
)
delete_project_group_hashes(
instance_list[0].project_id, group_ids_filter=issue_platform_group_ids
)

# If this isn't a retention cleanup also remove event data.
if not os.environ.get("_SENTRY_CLEANUP"):
@@ -259,21 +262,6 @@ def mark_deletion_in_progress(self, instance_list: Sequence[Group]) -> None:
).update(status=GroupStatus.DELETION_IN_PROGRESS, substatus=None)


def delete_project_group_hashes(project_id: int) -> None:
groups = []
for group in RangeQuerySetWrapper(
Group.objects.filter(project_id=project_id), step=GROUP_CHUNK_SIZE
):
groups.append(group)

This was effectively restricting the deletions to rows with a group set, missing all rows not associated with any group.

All GroupHash rows for this project have a null group_id:

SELECT count(*) FROM `getsentry.sentry_grouphash` WHERE project_id IN (4506269346365440) and group_id is null;
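
To illustrate the reviewer's point, here is a minimal sketch (not part of the diff) of why a group-based iteration can never reach orphaned hash rows; the model import paths are assumed, and the project ID is the hypothetical value from the query above:

from sentry.models.group import Group
from sentry.models.grouphash import GroupHash
from sentry.utils.query import RangeQuerySetWrapper

project_id = 4506269346365440  # hypothetical, taken from the SQL above

# Old flow: iterate over this project's Group rows, then delete hashes by group ID.
group_ids = [
    group.id
    for group in RangeQuerySetWrapper(Group.objects.filter(project_id=project_id))
]

# group_id__in=[...] compiles to SQL "group_id IN (...)", which is never true when
# group_id is NULL, so orphaned GroupHash rows are never selected by this query.
old_scope = GroupHash.objects.filter(project_id=project_id, group_id__in=group_ids)

# New flow: filtering by project alone also matches rows where group_id IS NULL.
new_scope = GroupHash.objects.filter(project_id=project_id)
orphans = new_scope.filter(group__isnull=True)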


error_groups, issue_platform_groups = separate_by_group_category(groups)
error_group_ids = [group.id for group in error_groups]
delete_group_hashes(project_id, error_group_ids, seer_deletion=True)

issue_platform_group_ids = [group.id for group in issue_platform_groups]
delete_group_hashes(project_id, issue_platform_group_ids)


def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
"""
Update seer_matched_grouphash to None for GroupHashMetadata rows
@@ -323,41 +311,52 @@ def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
metrics.incr("deletions.group_hash_metadata.max_iterations_reached", sample_rate=1.0)


def delete_group_hashes(
def delete_project_group_hashes(
project_id: int,
group_ids: Sequence[int],
group_ids_filter: Sequence[int] | None = None,
seer_deletion: bool = False,
) -> None:
# Validate batch size to ensure it's at least 1 to avoid ValueError in range()
"""
Delete GroupHash records for a project.

This is the main function for deleting GroupHash records. It can delete all hashes for a project
(used during project deletion to clean up orphaned records) or delete hashes for specific groups
(used during group deletion).

Args:
project_id: The project to delete hashes for
group_ids_filter: Optional filter for specific group IDs.
- If None: deletes ALL GroupHash records for the project (including orphans)
- If empty: returns immediately (no-op for safety)
- If non-empty: deletes only hashes for those specific groups
seer_deletion: Whether to notify Seer about the deletion
"""
# Safety: empty filter means nothing to delete
if group_ids_filter is not None and len(group_ids_filter) == 0:
return

hashes_batch_size = max(1, options.get("deletions.group-hashes-batch-size"))

# Set a reasonable upper bound on iterations to prevent infinite loops.
# The loop will naturally terminate when no more hashes are found.
iterations = 0
while iterations < GROUP_HASH_ITERATIONS:
qs = GroupHash.objects.filter(project_id=project_id, group_id__in=group_ids).values_list(
"id", "hash"
)[:hashes_batch_size]
hashes_chunk = list(qs)
# Base query: all hashes for this project
qs = GroupHash.objects.filter(project_id=project_id)

This allows querying for rows that do not have a group set.


# Apply group filter if provided
if group_ids_filter is not None:
qs = qs.filter(group_id__in=group_ids_filter)

hashes_chunk = list(qs.values_list("id", "hash")[:hashes_batch_size])
if not hashes_chunk:
break
try:
if seer_deletion:
# Tell seer to delete grouping records for these groups
# It's low priority to delete the hashes from seer, so we don't want
# any network errors to block the deletion of the groups
hash_values = [gh[1] for gh in hashes_chunk]
may_schedule_task_to_delete_hashes_from_seer(project_id, hash_values)
except Exception:
logger.warning("Error scheduling task to delete hashes from seer")
finally:
hash_ids = [gh[0] for gh in hashes_chunk]
# GroupHashMetadata rows can reference GroupHash rows via seer_matched_grouphash_id.
# Before deleting these GroupHash rows, we need to either:
# 1. Update seer_matched_grouphash to None first (to avoid foreign key constraint errors), OR
# 2. Delete the GroupHashMetadata rows entirely (they'll be deleted anyway)
# If we update the columns first, the deletion of the grouphash metadata rows will have less work to do,
# thus improving the performance of the deletion.
update_group_hash_metadata_in_batches(hash_ids)
GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
GroupHash.objects.filter(id__in=hash_ids).delete()
@@ -367,8 +366,8 @@ def delete_group_hashes(
if iterations == GROUP_HASH_ITERATIONS:
metrics.incr("deletions.group_hashes.max_iterations_reached", sample_rate=1.0)
logger.warning(
"Group hashes batch deletion reached the maximum number of iterations. "
"Investigate if we need to change the GROUP_HASH_ITERATIONS value."
"delete_group_hashes.max_iterations_reached",
extra={"project_id": project_id, "filtered": group_ids_filter is not None},
)
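
For reference, a minimal usage sketch of the reworked helper under the contract described in its docstring; the project and group IDs below are hypothetical, and this is not part of the diff:

from sentry.deletions.defaults.group import delete_project_group_hashes

project_id = 42  # hypothetical

# Specific groups only (group deletion path); also notifies Seer about the hashes.
delete_project_group_hashes(project_id, group_ids_filter=[101, 102], seer_deletion=True)

# Empty filter is a deliberate no-op for safety: nothing is deleted.
delete_project_group_hashes(project_id, group_ids_filter=[])

# No filter: delete every GroupHash row for the project, including orphans whose
# group_id is NULL (project deletion / cleanup path).
delete_project_group_hashes(project_id, group_ids_filter=None)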


67 changes: 66 additions & 1 deletion tests/sentry/deletions/test_group.py
@@ -9,7 +9,10 @@
from snuba_sdk import Column, Condition, DeleteQuery, Entity, Function, Op, Query, Request

from sentry import deletions, nodestore
from sentry.deletions.defaults.group import update_group_hash_metadata_in_batches
from sentry.deletions.defaults.group import (
delete_project_group_hashes,
update_group_hash_metadata_in_batches,
)
from sentry.deletions.tasks.groups import delete_groups_for_project
from sentry.issues.grouptype import FeedbackGroup, GroupCategory
from sentry.issues.issue_occurrence import IssueOccurrence
@@ -389,6 +392,68 @@ def test_update_group_hash_metadata_in_batches(self) -> None:
else:
raise AssertionError("GroupHashMetadata is None for grouphash id=%s" % grouphash.id)

def test_delete_project_group_hashes_specific_groups(self) -> None:
"""Test deleting grouphashes for specific group IDs (including metadata) and empty list safety."""
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

event_1 = self.store_event(
data={"platform": "python", "stacktrace": {"frames": [{"filename": "error_1.py"}]}},
project_id=self.project.id,
)
event_2 = self.store_event(
data={"platform": "python", "stacktrace": {"frames": [{"filename": "error_2.py"}]}},
project_id=self.project.id,
)

grouphash_1 = GroupHash.objects.get(group=event_1.group)
grouphash_2 = GroupHash.objects.get(group=event_2.group)
assert grouphash_1.metadata is not None
assert grouphash_2.metadata is not None
metadata_1_id = grouphash_1.metadata.id
metadata_2_id = grouphash_2.metadata.id

assert GroupHash.objects.filter(project=self.project).count() == 2

delete_project_group_hashes(
project_id=self.project.id,
group_ids_filter=[event_1.group.id],
)

assert not GroupHash.objects.filter(id=grouphash_1.id).exists()
assert not GroupHashMetadata.objects.filter(id=metadata_1_id).exists()
assert GroupHash.objects.filter(id=grouphash_2.id).exists()
assert GroupHashMetadata.objects.filter(id=metadata_2_id).exists()

# Empty list should be a no-op
delete_project_group_hashes(project_id=self.project.id, group_ids_filter=[])
assert GroupHash.objects.filter(id=grouphash_2.id).exists()

def test_delete_project_group_hashes_all_including_orphans(self) -> None:
"""Test deleting all grouphashes including orphans when group_ids_filter=None."""
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

event = self.store_event(
data={"platform": "python", "stacktrace": {"frames": [{"filename": "error.py"}]}},
project_id=self.project.id,
)
grouphash = GroupHash.objects.get(group=event.group)
assert grouphash.metadata is not None
metadata_id = grouphash.metadata.id

orphan_1 = GroupHash.objects.create(project=self.project, hash="a" * 32, group=None)
orphan_2 = GroupHash.objects.create(project=self.project, hash="b" * 32, group=None)

assert GroupHash.objects.filter(project=self.project).count() == 3
assert GroupHash.objects.filter(project=self.project, group__isnull=True).count() == 2

delete_project_group_hashes(project_id=self.project.id, group_ids_filter=None)

assert not GroupHash.objects.filter(id=grouphash.id).exists()
assert not GroupHash.objects.filter(id=orphan_1.id).exists()
assert not GroupHash.objects.filter(id=orphan_2.id).exists()
assert not GroupHashMetadata.objects.filter(id=metadata_id).exists()
assert GroupHash.objects.filter(project=self.project).count() == 0


class DeleteIssuePlatformTest(TestCase, SnubaTestCase, OccurrenceTestMixin):
referrer = Referrer.TESTING_TEST.value