Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/sentry/deletions/defaults/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
from sentry.issues.grouptype import GroupCategory, InvalidGroupTypeError
from sentry.models.group import Group, GroupStatus
from sentry.models.grouphash import GroupHash
from sentry.models.grouphashmetadata import GroupHashMetadata
from sentry.models.rulefirehistory import RuleFireHistory
from sentry.notifications.models.notificationmessage import NotificationMessage
from sentry.services.eventstore.models import Event
from sentry.snuba.dataset import Dataset
from sentry.tasks.delete_seer_grouping_records import may_schedule_task_to_delete_hashes_from_seer
from sentry.utils import metrics
from sentry.utils.query import RangeQuerySetWrapper

from ..base import BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
Expand Down Expand Up @@ -256,10 +258,38 @@ def delete_group_hashes(
logger.warning("Error scheduling task to delete hashes from seer")
finally:
hash_ids = [gh[0] for gh in hashes_chunk]
if options.get("deletions.group.delete_group_hashes_metadata_first"):
# If we delete the grouphash metadata rows first, we will not need to update the references to the other grouphashes.
# If we try to delete the group hashes first, the metadata columns referencing other grouphashes must be updated first.
#
# To understand this, let's say we have the following relationships:
# gh A -> ghm A -> no reference to another grouphash
# gh B -> ghm B -> gh C
# gh C -> ghm C -> gh A
#
# Deleting group hashes A, B & C (since they all point to the same group) will require:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is called as this: delete_group_hashes(project_id, group_ids), thus, all group hashes point to the same group.

Image

# * Updating columns ghmB & ghmC to point to None
# * Deleting the group hash metadata rows
# * Deleting the group hashes
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before this PR, the current approach is this:

  • Start group hashes deletion
    • Which triggers group hash metadata column updating
    • Which then triggers deleting the group hash metadata rows
  • Now that children group hash metadata are deleted we can delete the group hashes

#
# If we delete the metadata first, we will not need to update the columns before deleting them.
try:
GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
except Exception:
# XXX: Let's make sure that no issues are caused by this and then remove it
logger.exception("Error deleting group hash metadata")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once I enable the option, I would like to know if any problems are caused by this while falling back to the original behaviour rather than completely aborting the process.


GroupHash.objects.filter(id__in=hash_ids).delete()

iterations += 1

if iterations == GROUP_HASH_ITERATIONS:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive-by metric.

metrics.incr("deletions.group_hashes.max_iterations_reached", sample_rate=1.0)
logger.warning(
"Group hashes batch deletion reached the maximum number of iterations. "
"Investigate if we need to change the GROUP_HASH_ITERATIONS value."
)


def separate_by_group_category(instance_list: Sequence[Group]) -> tuple[list[Group], list[Group]]:
error_groups = []
Expand Down
3 changes: 2 additions & 1 deletion src/sentry/models/grouphash.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def metadata(self) -> GroupHashMetadata | None:
except AttributeError:
return None

__repr__ = sane_repr("group_id", "hash")
__repr__ = sane_repr("group_id", "hash", "metadata")
__str__ = __repr__
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes print statements during debugging actually useful.


def get_associated_fingerprint(self) -> list[str] | None:
"""
Expand Down
3 changes: 2 additions & 1 deletion src/sentry/models/grouphashmetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ def group_id(self) -> int | None:
def hash(self) -> str:
return self.grouphash.hash

__repr__ = sane_repr("grouphash_id", "group_id", "hash")
__repr__ = sane_repr("grouphash_id", "group_id", "hash", "seer_matched_grouphash")
__str__ = __repr__

def get_best_guess_schema_version(self) -> str:
"""
Expand Down
9 changes: 8 additions & 1 deletion src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,14 @@
# Deletions
register(
"deletions.group-hashes-batch-size",
default=10000,
default=100,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type=Int,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)
register(
"deletions.group.delete_group_hashes_metadata_first",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will control the new behaviour.

default=False,
type=Bool,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)

Expand Down
70 changes: 69 additions & 1 deletion tests/sentry/deletions/test_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from snuba_sdk import Column, Condition, Entity, Function, Op, Query, Request

from sentry import nodestore
from sentry import deletions, nodestore
from sentry.deletions.defaults.group import ErrorEventsDeletionTask
from sentry.deletions.tasks.groups import delete_groups_for_project
from sentry.issues.grouptype import FeedbackGroup, GroupCategory
Expand All @@ -16,6 +16,7 @@
from sentry.models.group import Group
from sentry.models.groupassignee import GroupAssignee
from sentry.models.grouphash import GroupHash
from sentry.models.grouphashmetadata import GroupHashMetadata
from sentry.models.grouphistory import GroupHistory, GroupHistoryStatus
from sentry.models.groupmeta import GroupMeta
from sentry.models.groupredirect import GroupRedirect
Expand Down Expand Up @@ -254,6 +255,73 @@ def test_invalid_group_type_handling(
"args": [self.project.id, error_group_hashes, 0]
}

def test_delete_grouphashes_and_metadata(self) -> None:
"""
Test that when deleting group hashes, references to other group hashes are updated and the group hash metadata is deleted.
"""
# This enables checking Seer for similarity and to mock the call to return a specific grouphash
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

# Create two events/grouphashes; one of them will be deleted
# Event A will be deleted
event_a = self.store_event(
data={
"platform": "python",
"stacktrace": {"frames": [{"filename": "error_a.py"}]},
},
project_id=self.project.id,
)
grouphash_a = GroupHash.objects.get(group_id=event_a.group_id)
assert grouphash_a.metadata is not None
assert grouphash_a.metadata.seer_matched_grouphash is None
metadata_a_id = grouphash_a.metadata.id

with mock.patch(
"sentry.grouping.ingest.seer.get_seer_similar_issues"
) as mock_get_seer_similar_issues:
# This will allow grouphash_b to be matched to grouphash_a by Seer
mock_get_seer_similar_issues.return_value = (0.01, grouphash_a)

# Event B will be kept - different exception to ensure different group hash to grouphash_a
event_b = self.store_event(
data={
"platform": "python",
"stacktrace": {"frames": [{"filename": "error_b.py"}]},
},
project_id=self.project.id,
)
grouphash_b = GroupHash.objects.get(hash=event_b.get_primary_hash())
# assert grouphash_b.metadata is not None
assert grouphash_b.metadata is not None
metadata_b_id = grouphash_b.metadata.id

# Verify that seer matched event_b to event_a's hash
assert event_a.group_id == event_b.group_id
# Make sure it has not changed
assert grouphash_a.metadata is not None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The django ORM record won't change even if the underlying db record was changed. You could add grouphash_a.refresh_from_db() to ensure that you have the latest data from postgres.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I forgot to push that change. I have it locally. I will make it part of my next PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this in #101796

assert grouphash_a.metadata.seer_matched_grouphash is None
assert grouphash_b.metadata is not None
assert grouphash_b.metadata.seer_matched_grouphash == grouphash_a
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This verifies that we have the column set to the first group hash. Not that we end up doing the update statement, however, I want to verify that we're testing the same code path.


with self.tasks():
# It will delete all groups, group hashes and group hash metadata
task = deletions.get(model=Group, query={"id__in": [event_a.group_id]})
more = task.chunk()
assert not more

assert not Group.objects.filter(id=event_a.group_id).exists()
assert not GroupHash.objects.filter(id=grouphash_a.id).exists()
assert not GroupHashMetadata.objects.filter(id=metadata_a_id).exists()
assert not GroupHash.objects.filter(id=grouphash_b.id).exists()
assert not GroupHashMetadata.objects.filter(id=metadata_b_id).exists()

def test_delete_grouphashes_and_metadata_but_delete_metadata_first(self) -> None:
"""
Test that when deleting group hashes, the group hash metadata is deleted first (which avoids updating the references to the other group hashes)
"""
with self.options({"deletions.group.delete_group_hashes_metadata_first": True}):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The two tests are functionally the same.
This test avoids the update call.

self.test_delete_grouphashes_and_metadata()


class DeleteIssuePlatformTest(TestCase, SnubaTestCase, OccurrenceTestMixin):
referrer = Referrer.TESTING_TEST.value
Expand Down
Loading