feat(similarity): Add delete record by hash task #72767

Merged
merged 21 commits into master from jodi/delete-record-by-hash-task on Jun 26, 2024

Changes from 14 commits

Commits (21)
ff6a517
feat(similarity-embedding): Delete grouping records on group delete
jangjodi May 28, 2024
071cb4e
fix: Add mock to test
jangjodi May 28, 2024
89278c8
ref: Separate changes
jangjodi Jun 13, 2024
0095b85
feat(similarity): Add delete record by hash task
jangjodi Jun 14, 2024
15a166d
Merge branch 'master' into jodi/delete-grouping-records-by-hash
jangjodi Jun 14, 2024
0a8f8fb
Merge branch 'jodi/delete-grouping-records-by-hash' into jodi/delete-…
jangjodi Jun 14, 2024
72b01fe
fix: Remove unneeded json.dumps
jangjodi Jun 14, 2024
7f9eabb
Merge remote-tracking branch 'origin' into jodi/delete-grouping-recor…
jangjodi Jun 14, 2024
ea9a59b
Merge branch 'master' into jodi/delete-grouping-records-by-hash
jangjodi Jun 20, 2024
c63330e
fix: Test
jangjodi Jun 20, 2024
e3b7268
ref: Add option for record delete batch size
jangjodi Jun 20, 2024
be7f3d7
fix: Test paths
jangjodi Jun 20, 2024
0231899
Merge branch 'jodi/delete-grouping-records-by-hash' into jodi/delete-…
jangjodi Jun 20, 2024
f429cab
fix: Option usage
jangjodi Jun 20, 2024
f606070
ref: Change batch size to 100
jangjodi Jun 20, 2024
a8f5733
fix: Test
jangjodi Jun 21, 2024
13cba90
Merge branch 'master' into jodi/delete-grouping-records-by-hash
jangjodi Jun 25, 2024
b44d88d
fix: Merge
jangjodi Jun 25, 2024
e147c4f
ref: Add timeout
jangjodi Jun 25, 2024
8694303
Merge branch 'jodi/delete-grouping-records-by-hash' into jodi/delete-…
jangjodi Jun 25, 2024
c4ecda3
Merge branch 'master' into jodi/delete-record-by-hash-task
jangjodi Jun 26, 2024
5 changes: 4 additions & 1 deletion src/sentry/conf/server.py
@@ -3422,9 +3422,12 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
SEER_GROUPING_RECORDS_URL = (
f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record"
)
SEER_GROUPING_RECORDS_DELETE_URL = (
SEER_PROJECT_GROUPING_RECORDS_DELETE_URL = (
f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete"
)
SEER_HASH_GROUPING_RECORDS_DELETE_URL = (
f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete-by-hash"
)

# TODO: Remove this soon, just a way to configure a project for this before we implement properly
UPTIME_POC_PROJECT_ID = 1
6 changes: 6 additions & 0 deletions src/sentry/options/defaults.py
@@ -929,6 +929,12 @@
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

register(
"embeddings-grouping.seer.delete-record-batch-size",
default=20,
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

# ## sentry.killswitches
#
# The following options are documented in sentry.killswitches in more detail
src/sentry/seer/similarity/grouping_records.py
@@ -4,7 +4,11 @@
from django.conf import settings
from urllib3.exceptions import ReadTimeoutError

from sentry.conf.server import SEER_GROUPING_RECORDS_DELETE_URL, SEER_GROUPING_RECORDS_URL
from sentry.conf.server import (
SEER_GROUPING_RECORDS_URL,
SEER_HASH_GROUPING_RECORDS_DELETE_URL,
SEER_PROJECT_GROUPING_RECORDS_DELETE_URL,
)
from sentry.net.http import connection_from_url
from sentry.seer.signed_seer_api import make_signed_seer_api_request
from sentry.seer.similarity.types import RawSeerSimilarIssueData
@@ -76,32 +80,62 @@ def post_bulk_grouping_records(
return {"success": False}


def delete_grouping_records(
def delete_project_grouping_records(
project_id: int,
) -> bool:
try:
# TODO: Move this over to POST json_api implementation
response = seer_grouping_connection_pool.urlopen(
"GET",
f"{SEER_GROUPING_RECORDS_DELETE_URL}/{project_id}",
f"{SEER_PROJECT_GROUPING_RECORDS_DELETE_URL}/{project_id}",
headers={"Content-Type": "application/json;charset=utf-8"},
timeout=POST_BULK_GROUPING_RECORDS_TIMEOUT,
)
except ReadTimeoutError:
logger.exception(
"seer.delete_grouping_records.timeout",
"seer.delete_grouping_records.project.timeout",
extra={"reason": "ReadTimeoutError", "timeout": POST_BULK_GROUPING_RECORDS_TIMEOUT},
)
return False

if response.status >= 200 and response.status < 300:
logger.info(
"seer.delete_grouping_records.success",
"seer.delete_grouping_records.project.success",
extra={"project_id": project_id},
)
return True
else:
logger.error(
"seer.delete_grouping_records.failure",
"seer.delete_grouping_records.project.failure",
)
return False


def delete_grouping_records_by_hash(project_id: int, hashes: list[str]) -> bool:
extra = {"project_id": project_id, "hashes": hashes}
try:
body = {"project_id": project_id, "hash_list": hashes}
response = seer_grouping_connection_pool.urlopen(
"POST",
SEER_HASH_GROUPING_RECORDS_DELETE_URL,
body=json.dumps(body),
headers={"Content-Type": "application/json;charset=utf-8"},
timeout=POST_BULK_GROUPING_RECORDS_TIMEOUT,
)
except ReadTimeoutError:
extra.update({"reason": "ReadTimeoutError", "timeout": POST_BULK_GROUPING_RECORDS_TIMEOUT})
logger.exception(
"seer.delete_grouping_records.hashes.timeout",
extra=extra,
)
return False

if response.status >= 200 and response.status < 300:
logger.info(
"seer.delete_grouping_records.hashes.success",
extra=extra,
)
return True
else:
logger.error("seer.delete_grouping_records.hashes.failure", extra=extra)
return False
31 changes: 31 additions & 0 deletions src/sentry/tasks/delete_seer_grouping_records_by_hash.py
@@ -0,0 +1,31 @@
from typing import Any

from sentry import options
from sentry.seer.similarity.grouping_records import delete_grouping_records_by_hash
from sentry.silo.base import SiloMode
from sentry.tasks.base import instrumented_task


@instrumented_task(
name="sentry.tasks.delete_seer_grouping_records_by_hash",
queue="delete_seer_grouping_records_by_hash",
max_retries=0,
silo_mode=SiloMode.REGION,
)
def delete_seer_grouping_records_by_hash(
project_id: int,
hashes: list[str],
last_deleted_index: int = 0,
*args: Any,
**kwargs: Any,
) -> None:
"""
Task to delete seer grouping records by hash list.
    Calls the seer delete-by-hash endpoint with batches of hashes whose size is set by the
    `embeddings-grouping.seer.delete-record-batch-size` option.
"""
batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size")
len_hashes = len(hashes)
end_index = min(last_deleted_index + batch_size, len_hashes)
delete_grouping_records_by_hash(project_id, hashes[last_deleted_index:end_index])
if end_index < len_hashes:
Review thread:

Member:
any reason to call the task again instead of just calling the delete_grouping_records_by_hash function repetitively?

Member (Author):
it's based on the logic we used for the backfill - I didn't want this task to timeout, so I'm respawning the task instead

Member:
what is our current timeout set to on the celery task?

Member (Author):
haha whoops just realized I didn't even add a timeout, but I was thinking of using the same timeouts from the backfill: soft_time_limit=60 * 15, time_limit=60 * (15 + 5)
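A minimal sketch of those backfill-style limits applied to this task's decorator (illustrative only; the timeout itself lands in a later commit, e147c4f "ref: Add timeout", and is not part of the 14-commit view shown here):

from typing import Any

from sentry.silo.base import SiloMode
from sentry.tasks.base import instrumented_task


@instrumented_task(
    name="sentry.tasks.delete_seer_grouping_records_by_hash",
    queue="delete_seer_grouping_records_by_hash",
    max_retries=0,
    silo_mode=SiloMode.REGION,
    soft_time_limit=60 * 15,  # raise SoftTimeLimitExceeded after 15 minutes
    time_limit=60 * (15 + 5),  # hard-kill the worker 5 minutes after that
)
def delete_seer_grouping_records_by_hash(
    project_id: int, hashes: list[str], last_deleted_index: int = 0, *args: Any, **kwargs: Any
) -> None:
    ...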

Member:
got it. i'd consider thinking about increasing the batch size to 100, and optionally doing some calls in a loop, so that way we have to spawn less celery tasks, as each one should run pretty quick (less than a second)
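A sketch of the loop variant being suggested: run a few batches per task invocation and only re-spawn when hashes remain, so fewer Celery tasks get scheduled. BATCHES_PER_TASK and the helper name are assumptions for illustration; the PR itself only bumps the batch-size option to 100 (commit f606070):

from sentry import options
from sentry.seer.similarity.grouping_records import delete_grouping_records_by_hash
from sentry.tasks.delete_seer_grouping_records_by_hash import delete_seer_grouping_records_by_hash

BATCHES_PER_TASK = 5  # illustrative value, not from this PR


def delete_hashes_in_loop(project_id: int, hashes: list[str], last_deleted_index: int = 0) -> None:
    batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size")
    len_hashes = len(hashes)
    for _ in range(BATCHES_PER_TASK):
        end_index = min(last_deleted_index + batch_size, len_hashes)
        delete_grouping_records_by_hash(project_id, hashes[last_deleted_index:end_index])
        last_deleted_index = end_index
        if last_deleted_index >= len_hashes:
            return
    # Hashes remain after this invocation's share of batches: hand the rest to a fresh task
    # so a single task never runs up against the time limit.
    delete_seer_grouping_records_by_hash.apply_async(args=[project_id, hashes, last_deleted_index])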

Member (Author):
nice, I didn't realize they were that quick - thanks!

delete_seer_grouping_records_by_hash.apply_async(args=[project_id, hashes, end_index])
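
For context on how this task is meant to be invoked: the first commit in the PR (ff6a517, "feat(similarity-embedding): Delete grouping records on group delete") points at group deletion as the caller, though no caller appears in the files shown above. A hypothetical caller sketch, with the function name assumed:

# Hypothetical caller sketch; not part of the diff above.
from sentry.tasks.delete_seer_grouping_records_by_hash import delete_seer_grouping_records_by_hash


def schedule_seer_record_deletion(project_id: int, hashes: list[str]) -> None:
    if not hashes:
        return
    # The task batches the hash list itself and re-spawns with the next start index as needed.
    delete_seer_grouping_records_by_hash.apply_async(args=[project_id, hashes])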
6 changes: 3 additions & 3 deletions src/sentry/tasks/embeddings_grouping/utils.py
@@ -19,10 +19,10 @@
from sentry.issues.occurrence_consumer import EventLookupError
from sentry.models.group import Group, GroupStatus
from sentry.models.project import Project
from sentry.seer.similarity.backfill import (
from sentry.seer.similarity.grouping_records import (
CreateGroupingRecordData,
CreateGroupingRecordsRequest,
delete_grouping_records,
delete_project_grouping_records,
post_bulk_grouping_records,
)
from sentry.seer.similarity.types import (
@@ -469,7 +469,7 @@ def delete_seer_grouping_records(
"backfill_seer_grouping_records.delete_all_seer_records",
extra={"project_id": project_id},
)
delete_grouping_records(project_id)
delete_project_grouping_records(project_id)
redis_client.delete(make_backfill_redis_key(project_id))

for groups in chunked(
tests/sentry/seer/similarity/test_grouping_records.py
@@ -8,9 +8,11 @@
from urllib3.exceptions import ReadTimeoutError
from urllib3.response import HTTPResponse

from sentry.seer.similarity.backfill import (
from sentry.conf.server import SEER_HASH_GROUPING_RECORDS_DELETE_URL
from sentry.seer.similarity.grouping_records import (
POST_BULK_GROUPING_RECORDS_TIMEOUT,
CreateGroupingRecordsRequest,
delete_grouping_records_by_hash,
post_bulk_grouping_records,
)
from sentry.utils import json
@@ -39,8 +41,8 @@


@pytest.mark.django_db
@mock.patch("sentry.seer.similarity.backfill.logger")
@mock.patch("sentry.seer.similarity.backfill.seer_grouping_connection_pool.urlopen")
@mock.patch("sentry.seer.similarity.grouping_records.logger")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_post_bulk_grouping_records_success(mock_seer_request: MagicMock, mock_logger: MagicMock):
expected_return_value = {
"success": True,
@@ -63,8 +65,8 @@ def test_post_bulk_grouping_records_success(mock_seer_request: MagicMock, mock_l


@pytest.mark.django_db
@mock.patch("sentry.seer.similarity.backfill.logger")
@mock.patch("sentry.seer.similarity.backfill.seer_grouping_connection_pool.urlopen")
@mock.patch("sentry.seer.similarity.grouping_records.logger")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_post_bulk_grouping_records_timeout(mock_seer_request: MagicMock, mock_logger: MagicMock):
expected_return_value = {"success": False}
mock_seer_request.side_effect = ReadTimeoutError(
@@ -86,8 +88,8 @@ def test_post_bulk_grouping_records_timeout(mock_seer_request: MagicMock, mock_l


@pytest.mark.django_db
@mock.patch("sentry.seer.similarity.backfill.logger")
@mock.patch("sentry.seer.similarity.backfill.seer_grouping_connection_pool.urlopen")
@mock.patch("sentry.seer.similarity.grouping_records.logger")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_post_bulk_grouping_records_failure(mock_seer_request: MagicMock, mock_logger: MagicMock):
expected_return_value = {"success": False}
mock_seer_request.return_value = HTTPResponse(
@@ -110,7 +112,7 @@ def test_post_bulk_grouping_records_failure(mock_seer_request: MagicMock, mock_l


@pytest.mark.django_db
@mock.patch("sentry.seer.similarity.backfill.seer_grouping_connection_pool.urlopen")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_post_bulk_grouping_records_empty_data(mock_seer_request: MagicMock):
"""Test that function handles empty data. This should not happen, but we do not want to error if it does."""
expected_return_value = {"success": True}
@@ -121,3 +123,68 @@ def test_post_bulk_grouping_records_empty_data(mock_seer_request: MagicMock):
empty_data["data"] = []
response = post_bulk_grouping_records(empty_data)
assert response == expected_return_value


@mock.patch("sentry.seer.similarity.grouping_records.logger")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_delete_grouping_records_by_hash_success(
mock_seer_request: MagicMock, mock_logger: MagicMock
):
mock_seer_request.return_value = HTTPResponse(
json.dumps({"success": True}).encode("utf-8"), status=200
)

project_id, hashes = 1, ["1", "2"]
response = delete_grouping_records_by_hash(project_id, hashes)
assert response is True
mock_logger.info.assert_called_with(
"seer.delete_grouping_records.hashes.success",
extra={
"hashes": hashes,
"project_id": project_id,
},
)


@mock.patch("sentry.seer.similarity.grouping_records.logger")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_delete_grouping_records_by_hash_timeout(
mock_seer_request: MagicMock, mock_logger: MagicMock
):
mock_seer_request.side_effect = ReadTimeoutError(
DUMMY_POOL, SEER_HASH_GROUPING_RECORDS_DELETE_URL, "read timed out"
)
project_id, hashes = 1, ["1", "2"]
response = delete_grouping_records_by_hash(project_id, hashes)
assert response is False
mock_logger.exception.assert_called_with(
"seer.delete_grouping_records.hashes.timeout",
extra={
"hashes": hashes,
"project_id": project_id,
"reason": "ReadTimeoutError",
"timeout": POST_BULK_GROUPING_RECORDS_TIMEOUT,
},
)


@mock.patch("sentry.seer.similarity.grouping_records.logger")
@mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
def test_delete_grouping_records_by_hash_failure(
mock_seer_request: MagicMock, mock_logger: MagicMock
):
mock_seer_request.return_value = HTTPResponse(
b"<!doctype html>\n<html lang=en>\n<title>500 Internal Server Error</title>\n<h1>Internal Server Error</h1>\n<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>\n",
reason="INTERNAL SERVER ERROR",
status=500,
)
project_id, hashes = 1, ["1", "2"]
response = delete_grouping_records_by_hash(project_id, hashes)
assert response is False
mock_logger.error.assert_called_with(
"seer.delete_grouping_records.hashes.failure",
extra={
"hashes": hashes,
"project_id": project_id,
},
)
8 changes: 4 additions & 4 deletions tests/sentry/tasks/test_backfill_seer_grouping_records.py
@@ -16,7 +16,7 @@
from sentry.issues.occurrence_consumer import EventLookupError
from sentry.models.group import Group, GroupStatus
from sentry.models.grouphash import GroupHash
from sentry.seer.similarity.backfill import CreateGroupingRecordData
from sentry.seer.similarity.grouping_records import CreateGroupingRecordData
from sentry.seer.similarity.types import RawSeerSimilarIssueData
from sentry.snuba.dataset import Dataset
from sentry.snuba.referrer import Referrer
@@ -694,8 +694,8 @@ def test_backfill_seer_grouping_records_multiple_batches(self, mock_post_bulk_gr
assert last_processed_index == len(groups)

@with_feature("projects:similarity-embeddings-backfill")
@patch("sentry.tasks.embeddings_grouping.utils.delete_grouping_records")
def test_backfill_seer_grouping_records_only_delete(self, mock_delete_grouping_records):
@patch("sentry.tasks.embeddings_grouping.utils.delete_project_grouping_records")
def test_backfill_seer_grouping_records_only_delete(self, mock_project_delete_grouping_records):
"""
Test that when the only_delete flag is on, seer_similarity is deleted from the metadata
if it exists
@@ -724,7 +724,7 @@ def test_backfill_seer_grouping_records_only_delete(self, mock_delete_grouping_r
event.group.save()
group_ids.append(event.group.id)

mock_delete_grouping_records.return_value = True
mock_project_delete_grouping_records.return_value = True
with TaskRunner():
backfill_seer_grouping_records_for_project(self.project.id, None, only_delete=True)

24 changes: 24 additions & 0 deletions tests/sentry/tasks/test_delete_seer_grouping_records_by_hash.py
@@ -0,0 +1,24 @@
from unittest.mock import patch

from sentry.tasks.delete_seer_grouping_records_by_hash import delete_seer_grouping_records_by_hash
from sentry.testutils.pytest.fixtures import django_db_all


@django_db_all
@patch("sentry.tasks.delete_seer_grouping_records_by_hash.delete_grouping_records_by_hash")
@patch(
"sentry.tasks.delete_seer_grouping_records_by_hash.delete_seer_grouping_records_by_hash.apply_async"
)
def test_delete_seer_grouping_records_by_hash_batches(
mock_delete_seer_grouping_records_by_hash_apply_async, mock_delete_grouping_records_by_hash
):
"""
Test that when delete_seer_grouping_records_by_hash is called with over 20 hashes, it spawns
another task with the end index of the previous batch.
"""
mock_delete_grouping_records_by_hash.return_value = True
project_id, hashes = 1, [str(i) for i in range(21)]
delete_seer_grouping_records_by_hash(project_id, hashes, 0)
assert mock_delete_seer_grouping_records_by_hash_apply_async.call_args[1] == {
"args": [project_id, hashes, 20]
}