diff --git a/src/sentry/deletions/defaults/group.py b/src/sentry/deletions/defaults/group.py
index 9de6f8fc3c81e7..1fd16b8647257f 100644
--- a/src/sentry/deletions/defaults/group.py
+++ b/src/sentry/deletions/defaults/group.py
@@ -11,11 +11,13 @@
 from sentry.issues.grouptype import GroupCategory, InvalidGroupTypeError
 from sentry.models.group import Group, GroupStatus
 from sentry.models.grouphash import GroupHash
+from sentry.models.grouphashmetadata import GroupHashMetadata
 from sentry.models.rulefirehistory import RuleFireHistory
 from sentry.notifications.models.notificationmessage import NotificationMessage
 from sentry.services.eventstore.models import Event
 from sentry.snuba.dataset import Dataset
 from sentry.tasks.delete_seer_grouping_records import may_schedule_task_to_delete_hashes_from_seer
+from sentry.utils import metrics
 from sentry.utils.query import RangeQuerySetWrapper
 
 from ..base import BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
@@ -256,10 +258,38 @@ def delete_group_hashes(
                 logger.warning("Error scheduling task to delete hashes from seer")
             finally:
                 hash_ids = [gh[0] for gh in hashes_chunk]
+                if options.get("deletions.group.delete_group_hashes_metadata_first"):
+                    # If we delete the grouphash metadata rows first, we will not need to update their references to other grouphashes.
+                    # If we delete the grouphashes first, those columns have to be updated before the deletion can proceed.
+                    #
+                    # To understand this, let's say we have the following relationships:
+                    # gh A -> ghm A -> no reference to another grouphash
+                    # gh B -> ghm B -> gh C
+                    # gh C -> ghm C -> gh A
+                    #
+                    # Deleting grouphashes A, B & C (since they all point to the same group) requires:
+                    # * Updating ghm B & ghm C to point to None
+                    # * Deleting the grouphash metadata rows
+                    # * Deleting the grouphashes
+                    #
+                    # If we delete the metadata rows first, no columns need to be updated before deleting them.
+                    try:
+                        GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
+                    except Exception:
+                        # XXX: Let's make sure that no issues are caused by this and then remove it
+                        logger.exception("Error deleting group hash metadata")
+
                 GroupHash.objects.filter(id__in=hash_ids).delete()
                 iterations += 1
 
+    if iterations == GROUP_HASH_ITERATIONS:
+        metrics.incr("deletions.group_hashes.max_iterations_reached", sample_rate=1.0)
+        logger.warning(
+            "Group hashes batch deletion reached the maximum number of iterations. "
+            "Investigate if we need to change the GROUP_HASH_ITERATIONS value."
+        )
+
 
 def separate_by_group_category(instance_list: Sequence[Group]) -> tuple[list[Group], list[Group]]:
     error_groups = []
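Reviewer note: to make the gh/ghm diagram in the comment above concrete, here is a tiny self-contained toy model. The ToyGroupHash/ToyGroupHashMetadata names are invented for illustration (plain dataclasses, not Sentry's actual models or deletion code); it only counts the UPDATEs each ordering would force: deleting the grouphashes first requires nulling every metadata row that points at a doomed grouphash, while deleting the metadata rows first leaves nothing to update.

from dataclasses import dataclass


@dataclass
class ToyGroupHash:
    id: int


@dataclass
class ToyGroupHashMetadata:
    grouphash: ToyGroupHash
    seer_matched_grouphash: ToyGroupHash | None = None


def updates_needed_if_hashes_go_first(
    metadata_rows: list[ToyGroupHashMetadata], doomed_hash_ids: set[int]
) -> int:
    # Each metadata row whose seer_matched_grouphash points at a doomed grouphash
    # must be set to None before that grouphash row can be deleted.
    return sum(
        1
        for ghm in metadata_rows
        if ghm.seer_matched_grouphash is not None
        and ghm.seer_matched_grouphash.id in doomed_hash_ids
    )


gh_a, gh_b, gh_c = ToyGroupHash(1), ToyGroupHash(2), ToyGroupHash(3)
rows = [
    ToyGroupHashMetadata(gh_a),  # ghm A -> no reference to another grouphash
    ToyGroupHashMetadata(gh_b, seer_matched_grouphash=gh_c),  # ghm B -> gh C
    ToyGroupHashMetadata(gh_c, seer_matched_grouphash=gh_a),  # ghm C -> gh A
]

# Grouphashes first: ghm B and ghm C must be updated before A, B & C can be deleted.
assert updates_needed_if_hashes_go_first(rows, {1, 2, 3}) == 2
# Metadata first: the referencing rows are already gone, so there is nothing to update.
assert updates_needed_if_hashes_go_first([], {1, 2, 3}) == 0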
diff --git a/src/sentry/models/grouphash.py b/src/sentry/models/grouphash.py
index d0e4d7b0db6695..34f483f946fe1e 100644
--- a/src/sentry/models/grouphash.py
+++ b/src/sentry/models/grouphash.py
@@ -53,7 +53,8 @@ def metadata(self) -> GroupHashMetadata | None:
         except AttributeError:
             return None
 
-    __repr__ = sane_repr("group_id", "hash")
+    __repr__ = sane_repr("group_id", "hash", "metadata")
+    __str__ = __repr__
 
     def get_associated_fingerprint(self) -> list[str] | None:
         """
diff --git a/src/sentry/models/grouphashmetadata.py b/src/sentry/models/grouphashmetadata.py
index 986952b0c6a60d..e01de4a481b0b4 100644
--- a/src/sentry/models/grouphashmetadata.py
+++ b/src/sentry/models/grouphashmetadata.py
@@ -131,7 +131,8 @@ def group_id(self) -> int | None:
     def hash(self) -> str:
         return self.grouphash.hash
 
-    __repr__ = sane_repr("grouphash_id", "group_id", "hash")
+    __repr__ = sane_repr("grouphash_id", "group_id", "hash", "seer_matched_grouphash")
+    __str__ = __repr__
 
     def get_best_guess_schema_version(self) -> str:
         """
diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py
index 520442f5883bfe..73e22288935e5a 100644
--- a/src/sentry/options/defaults.py
+++ b/src/sentry/options/defaults.py
@@ -338,7 +338,14 @@
 
 # Deletions
 register(
     "deletions.group-hashes-batch-size",
-    default=10000,
+    default=100,
+    type=Int,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
+register(
+    "deletions.group.delete_group_hashes_metadata_first",
+    default=False,
+    type=Bool,
     flags=FLAG_AUTOMATOR_MODIFIABLE,
 )
diff --git a/tests/sentry/deletions/test_group.py b/tests/sentry/deletions/test_group.py
index 29099a44c5b74b..de230707d9474a 100644
--- a/tests/sentry/deletions/test_group.py
+++ b/tests/sentry/deletions/test_group.py
@@ -7,7 +7,7 @@
 
 from snuba_sdk import Column, Condition, Entity, Function, Op, Query, Request
 
-from sentry import nodestore
+from sentry import deletions, nodestore
 from sentry.deletions.defaults.group import ErrorEventsDeletionTask
 from sentry.deletions.tasks.groups import delete_groups_for_project
 from sentry.issues.grouptype import FeedbackGroup, GroupCategory
@@ -16,6 +16,7 @@
 from sentry.models.group import Group
 from sentry.models.groupassignee import GroupAssignee
 from sentry.models.grouphash import GroupHash
+from sentry.models.grouphashmetadata import GroupHashMetadata
 from sentry.models.grouphistory import GroupHistory, GroupHistoryStatus
 from sentry.models.groupmeta import GroupMeta
 from sentry.models.groupredirect import GroupRedirect
@@ -254,6 +255,73 @@ def test_invalid_group_type_handling(
             "args": [self.project.id, error_group_hashes, 0]
         }
 
+    def test_delete_grouphashes_and_metadata(self) -> None:
+        """
+        Test that deleting group hashes also deletes their metadata and updates metadata rows that reference other group hashes.
+        """
+        # This enables checking Seer for similarity, so the mocked call below can return a specific grouphash
+        self.project.update_option("sentry:similarity_backfill_completed", int(time()))
+
+        # Create two events/grouphashes
+        # Event A will be deleted
+        event_a = self.store_event(
+            data={
+                "platform": "python",
+                "stacktrace": {"frames": [{"filename": "error_a.py"}]},
+            },
+            project_id=self.project.id,
+        )
+        grouphash_a = GroupHash.objects.get(group_id=event_a.group_id)
+        assert grouphash_a.metadata is not None
+        assert grouphash_a.metadata.seer_matched_grouphash is None
+        metadata_a_id = grouphash_a.metadata.id
+
+        with mock.patch(
+            "sentry.grouping.ingest.seer.get_seer_similar_issues"
+        ) as mock_get_seer_similar_issues:
+            # This will allow grouphash_b to be matched to grouphash_a by Seer
+            mock_get_seer_similar_issues.return_value = (0.01, grouphash_a)
+
+            # Event B will be kept - a different stacktrace ensures a group hash different from grouphash_a
+            event_b = self.store_event(
+                data={
+                    "platform": "python",
+                    "stacktrace": {"frames": [{"filename": "error_b.py"}]},
+                },
+                project_id=self.project.id,
+            )
+            grouphash_b = GroupHash.objects.get(hash=event_b.get_primary_hash())
+            assert grouphash_b.metadata is not None
+            metadata_b_id = grouphash_b.metadata.id
+
+        # Verify that Seer matched event_b to event_a's hash
+        assert event_a.group_id == event_b.group_id
+        # Make sure grouphash_a's metadata has not changed
+        assert grouphash_a.metadata is not None
+        assert grouphash_a.metadata.seer_matched_grouphash is None
+        assert grouphash_b.metadata is not None
+        assert grouphash_b.metadata.seer_matched_grouphash == grouphash_a
+
+        with self.tasks():
+            # This will delete the group, its group hashes and their metadata
+            task = deletions.get(model=Group, query={"id__in": [event_a.group_id]})
+            more = task.chunk()
+            assert not more
+
+        assert not Group.objects.filter(id=event_a.group_id).exists()
+        assert not GroupHash.objects.filter(id=grouphash_a.id).exists()
+        assert not GroupHashMetadata.objects.filter(id=metadata_a_id).exists()
+        assert not GroupHash.objects.filter(id=grouphash_b.id).exists()
+        assert not GroupHashMetadata.objects.filter(id=metadata_b_id).exists()
+
+    def test_delete_grouphashes_and_metadata_but_delete_metadata_first(self) -> None:
+        """
+        Same as test_delete_grouphashes_and_metadata, but with the option enabled so the metadata rows are deleted first and no references to other group hashes need updating.
+        """
+        with self.options({"deletions.group.delete_group_hashes_metadata_first": True}):
+            self.test_delete_grouphashes_and_metadata()
+
 
 class DeleteIssuePlatformTest(TestCase, SnubaTestCase, OccurrenceTestMixin):
     referrer = Referrer.TESTING_TEST.value
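Reviewer note: if we want the option-enabled test to prove that the metadata-first path really skips the UPDATE pass, one possible follow-up is sketched below. It is not part of this change; it assumes the snippet lives inside the option-enabled test (so self, deletions, Group and event_a are in scope), uses the default test database connection, and assumes the referencing column name contains "seer_matched_grouphash". It captures the SQL issued during the deletion with Django's CaptureQueriesContext.

from django.db import connection
from django.test.utils import CaptureQueriesContext

# Wrap the deletion step and assert that no UPDATE touched the
# seer_matched_grouphash reference on the way to deleting the rows.
with CaptureQueriesContext(connection) as ctx:
    with self.tasks():
        task = deletions.get(model=Group, query={"id__in": [event_a.group_id]})
        task.chunk()

assert not any(
    "UPDATE" in query["sql"] and "seer_matched_grouphash" in query["sql"]
    for query in ctx.captured_queries
)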