Merged
91 changes: 26 additions & 65 deletions src/sentry/deletions/defaults/group.py
@@ -42,6 +42,7 @@
GROUP_CHUNK_SIZE = 100
EVENT_CHUNK_SIZE = 10000
GROUP_HASH_ITERATIONS = 10000
GROUP_HASH_METADATA_ITERATIONS = 10000

# Group models that relate only to groups and not to events. These models are
# transferred during reprocessing operations because they represent group-level
@@ -267,83 +268,46 @@ def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:

This function performs the update in smaller batches to reduce lock
contention and prevent statement timeouts when many rows need updating.
Uses cursor-based pagination with the primary key to avoid loading all
IDs into memory and to avoid growing NOT IN clauses.
Includes a maximum iteration limit as a safeguard against potential
infinite loops.
"""
option_batch_size = options.get("deletions.group-hash-metadata.batch-size", 1000)
option_batch_size = options.get("deletions.group-hash-metadata.batch-size")
batch_size = max(1, option_batch_size)
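For context, the fallback argument is dropped from options.get(), so the batch size presumably now comes from the option registry alone. A hedged sketch of what the registration in src/sentry/options/defaults.py might look like, following the register() pattern visible further down in this diff; the default value of 1000 is an assumption carried over from the old fallback, not something shown here:

register(
    "deletions.group-hash-metadata.batch-size",
    default=1000,  # assumed from the old options.get(..., 1000) fallback
    type=Int,
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)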

# Use cursor-based pagination with the primary key to efficiently
# process large datasets without loading all IDs into memory or
# creating large NOT IN clauses. We fetch IDs without ORDER BY to avoid
# database sorting overhead, then sort the small batch in Python.
last_max_id = 0
Member Author:
This was mentioned by @wedamija in here:
#102960 (comment)

while True:
# Process rows in batches with a maximum iteration limit to prevent
# infinite loops while still allowing processing of large datasets.
updated_rows = 0
iteration_count = 0
while iteration_count < GROUP_HASH_METADATA_ITERATIONS:
iteration_count += 1
# Note: hash_ids is bounded to ~100 items (deletions.group-hashes-batch-size)
# from the caller, so this IN clause is intentionally not batched
batch_metadata_ids = list(
GroupHashMetadata.objects.filter(
seer_matched_grouphash_id__in=hash_ids, id__gt=last_max_id
Member Author:
We don't need id__gt=last_max_id since we don't care about ordering at all. Once the rows' seer_matched_grouphash column is updated (see block below), we will never be able to select those rows again.

GroupHashMetadata.objects.filter(id__in=batch_metadata_ids).update(
    seer_matched_grouphash=None
)
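A plain-Python toy model of that argument (illustrative only; the dict, IDs, and batch size are made up): because each pass clears the very column the filter matches on, re-running the same query never returns already-processed rows, so the loop converges without an id cursor.

# Toy stand-in for GroupHashMetadata rows; not Sentry code.
rows = {row_id: {"seer_matched_grouphash": 42} for row_id in range(1, 11)}
batch_size = 3

while True:
    # Same filter every time: rows whose seer_matched_grouphash is still set.
    batch = [
        row_id
        for row_id, row in rows.items()
        if row["seer_matched_grouphash"] is not None
    ][:batch_size]
    if not batch:
        break
    # "Updating" the rows removes them from future matches automatically.
    for row_id in batch:
        rows[row_id]["seer_matched_grouphash"] = None

assert all(row["seer_matched_grouphash"] is None for row in rows.values())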

Member Author:
The main problem with this query is that we're filtering on two columns, so it would require a composite index.

Member:
Postgres will pick either the primary key or the foreign key index and then sequentially scan from there. Going down to one column in the condition makes query planning simpler.
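To make that concrete, a hedged sketch of the two predicate shapes being compared, assuming a Django shell inside Sentry; the import path and placeholder hash_ids are assumptions, and inspecting queryset.query / QuerySet.explain() is standard Django rather than anything specific to this PR:

from sentry.models.grouphashmetadata import GroupHashMetadata  # path assumed

hash_ids = [1, 2, 3]  # placeholder values

# Old shape: two columns in the WHERE clause; fully index-driven access would
# want a composite index on (seer_matched_grouphash_id, id).
two_column = GroupHashMetadata.objects.filter(
    seer_matched_grouphash_id__in=hash_ids, id__gt=0
).values_list("id", flat=True)[:1000]

# New shape: a single, already-indexed foreign-key column, so the planner
# only has to pick that index and scan from there.
one_column = GroupHashMetadata.objects.filter(
    seer_matched_grouphash_id__in=hash_ids
).values_list("id", flat=True)[:1000]

# Print the generated SQL; .explain() would show the plan Postgres chooses.
print(two_column.query)
print(one_column.query)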

).values_list("id", flat=True)[:batch_size]
GroupHashMetadata.objects.filter(seer_matched_grouphash_id__in=hash_ids).values_list(
"id", flat=True
)[:batch_size]
)
if not batch_metadata_ids:
break

# Sort in Python to ensure we process lowest IDs first and can safely
# advance the cursor. Sorting a small batch (e.g., 1000 items) in Python
# is trivial and avoids database ORDER BY overhead.
batch_metadata_ids.sort()

updated = GroupHashMetadata.objects.filter(id__in=batch_metadata_ids).update(
seer_matched_grouphash=None
)
updated_rows += updated
metrics.incr("deletions.group_hash_metadata.rows_updated", amount=updated, sample_rate=1.0)
# It's possible we'd keep trying to update the same rows over and over,
# so let's break out of the loop.
if updated == 0:
break

last_max_id = batch_metadata_ids[-1] # Last element after sorting


def update_group_hash_metadata_in_batches_old(hash_ids: Sequence[int]) -> int:
Member Author:
We're dropping the old code, as the new query is better and hasn't made anything worse.

"""
Update seer_matched_grouphash to None for GroupHashMetadata rows
that reference the given hash_ids, in batches to avoid timeouts.

This function performs the update in smaller batches to reduce lock
contention and prevent statement timeouts when many rows need updating.

Returns the total number of rows updated.
"""
# First, get all the IDs that need updating
metadata_ids = list(
GroupHashMetadata.objects.filter(seer_matched_grouphash_id__in=hash_ids).values_list(
"id", flat=True
# We will retry these hash_ids on the next run of the cleanup script.
# This is a safeguard to prevent infinite loops.
if iteration_count >= GROUP_HASH_METADATA_ITERATIONS:
logger.warning(
"update_group_hash_metadata_in_batches.max_iterations_reached",
extra={"updated_rows": updated_rows},
)
)

if not metadata_ids:
return 0

option_batch_size = options.get("deletions.group-hash-metadata.batch-size", 1000)
batch_size = max(1, option_batch_size)
total_updated = 0
for i in range(0, len(metadata_ids), batch_size):
batch = metadata_ids[i : i + batch_size]
updated = GroupHashMetadata.objects.filter(id__in=batch).update(seer_matched_grouphash=None)
total_updated += updated

metrics.incr(
"deletions.group_hash_metadata.rows_updated",
amount=total_updated,
sample_rate=1.0,
)
logger.info(
"update_group_hash_metadata_in_batches.complete",
extra={
"hash_ids_count": len(hash_ids),
"total_updated": total_updated,
},
)

return total_updated
metrics.incr("deletions.group_hash_metadata.max_iterations_reached", sample_rate=1.0)
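Since the unified diff interleaves the deleted _old helper with the new lines, here is a rough reassembly of the resulting update_group_hash_metadata_in_batches, for readability only. It is stitched together from the added lines above; indentation and the placement of the final metrics.incr call inside the if block are inferred, and options, metrics, logger, GroupHashMetadata, and Sequence are assumed to come from the module's existing imports:

def update_group_hash_metadata_in_batches(hash_ids: Sequence[int]) -> None:
    option_batch_size = options.get("deletions.group-hash-metadata.batch-size")
    batch_size = max(1, option_batch_size)

    # Process rows in batches with a maximum iteration limit to prevent
    # infinite loops while still allowing processing of large datasets.
    updated_rows = 0
    iteration_count = 0
    while iteration_count < GROUP_HASH_METADATA_ITERATIONS:
        iteration_count += 1
        # hash_ids is bounded to ~100 items (deletions.group-hashes-batch-size)
        # from the caller, so this IN clause is intentionally not batched.
        batch_metadata_ids = list(
            GroupHashMetadata.objects.filter(
                seer_matched_grouphash_id__in=hash_ids
            ).values_list("id", flat=True)[:batch_size]
        )
        if not batch_metadata_ids:
            break

        updated = GroupHashMetadata.objects.filter(id__in=batch_metadata_ids).update(
            seer_matched_grouphash=None
        )
        updated_rows += updated
        metrics.incr(
            "deletions.group_hash_metadata.rows_updated", amount=updated, sample_rate=1.0
        )
        # If nothing was updated we may be retrying the same rows; stop here.
        if updated == 0:
            break

    # Remaining hash_ids will be retried on the next run of the cleanup script;
    # the iteration cap is a safeguard against infinite loops.
    if iteration_count >= GROUP_HASH_METADATA_ITERATIONS:
        logger.warning(
            "update_group_hash_metadata_in_batches.max_iterations_reached",
            extra={"updated_rows": updated_rows},
        )
        metrics.incr(
            "deletions.group_hash_metadata.max_iterations_reached", sample_rate=1.0
        )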


def delete_group_hashes(
@@ -381,10 +345,7 @@ def delete_group_hashes(
# 2. Delete the GroupHashMetadata rows entirely (they'll be deleted anyway)
# If we update the columns first, the deletion of the grouphash metadata rows will have less work to do,
# thus improving the performance of the deletion.
if options.get("deletions.group-hash-metadata.use-old-update-method"):
update_group_hash_metadata_in_batches_old(hash_ids)
else:
update_group_hash_metadata_in_batches(hash_ids)
update_group_hash_metadata_in_batches(hash_ids)
GroupHashMetadata.objects.filter(grouphash_id__in=hash_ids).delete()
GroupHash.objects.filter(id__in=hash_ids).delete()

6 changes: 0 additions & 6 deletions src/sentry/options/defaults.py
@@ -348,12 +348,6 @@
type=Int,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)
register(
"deletions.group-hash-metadata.use-old-update-method",
default=False, # Default using new update method
type=Bool,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)


register(