feat(code-mappings): Add new task to find projects with missing code mappings #40271

Merged
11 commits merged on Oct 24, 2022
1 change: 1 addition & 0 deletions mypy.ini
@@ -113,6 +113,7 @@ files = fixtures/mypy-stubs,
src/sentry/tasks/store.py,
src/sentry/tasks/symbolication.py,
src/sentry/tasks/update_user_reports.py,
src/sentry/tasks/derive_code_mappings.py,
src/sentry/testutils/modelmanifest.py,
src/sentry/testutils/silo.py,
src/sentry/types/region.py,
1 change: 1 addition & 0 deletions src/sentry/conf/server.py
@@ -684,6 +684,7 @@ def SOCIAL_AUTH_DEFAULT_USERNAME():
Queue("replays.delete_replay", routing_key="replays.delete_replay"),
Queue("counters-0", routing_key="counters-0"),
Queue("triggers-0", routing_key="triggers-0"),
Queue("derive_code_mappings", routing_key="derive_code_mappings"),
Reviewer comment (Member): You'll need to speak to ops to have them assign workers to this queue.

]

for queue in CELERY_QUEUES:
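For context on how work reaches this queue: the task added below declares queue="derive_code_mappings", so any .delay()/apply_async() call on it is routed here once workers are assigned, as the reviewer notes above. If the task is ever meant to run periodically, a beat entry along the following lines would also be needed; this is a hypothetical sketch only (the schedule key, frequency, and the existence of a schedule at all are assumptions, not part of this PR).

from datetime import timedelta

# Hypothetical CELERYBEAT_SCHEDULE entry -- illustrative only, not added by this PR.
CELERYBEAT_SCHEDULE["derive-code-mappings"] = {
    "task": "sentry.tasks.derive_code_mappings.identify_stacktrace_paths",
    "schedule": timedelta(days=1),  # frequency is an assumption
    "options": {"queue": "derive_code_mappings"},
}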
100 changes: 100 additions & 0 deletions src/sentry/tasks/derive_code_mappings.py
@@ -0,0 +1,100 @@
import logging
from datetime import timedelta
from typing import Any, List, Mapping, Optional, Set, Tuple

from django.utils import timezone

from sentry.db.models.fields.node import NodeData
from sentry.models import Project
from sentry.models.group import Group
from sentry.models.organization import Organization, OrganizationStatus
from sentry.tasks.base import instrumented_task
from sentry.utils.safe import get_path

ACTIVE_PROJECT_THRESHOLD = timedelta(days=7)
GROUP_ANALYSIS_RANGE = timedelta(days=14)

logger = logging.getLogger("sentry.tasks.derive_code_mappings")


@instrumented_task( # type: ignore
name="sentry.tasks.derive_code_mappings.identify_stacktrace_paths",
queue="derive_code_mappings",
max_retries=0, # if we don't backfill it this time, we'll get it the next time
)
def identify_stacktrace_paths(
organizations: Optional[List[Organization]] = None,
) -> Mapping[str, Mapping[str, List[str]]]:
Reviewer comment (Member) on lines +20 to +27: How will this be called, and how many orgs are we generally likely to pass?

"""
Generate a map of projects to stacktrace paths for specified organizations,
or all active organizations if unspecified.

    This filters out projects that have not had an event in the last 7 days or that
    have non-Python files in the stacktrace.
"""
if organizations is None:
organizations = Organization.objects.filter(status=OrganizationStatus.ACTIVE)

filename_maps = {}
for org in organizations:
projects = Project.objects.filter(organization=org, first_event__isnull=False)
projects = [
project
for project in projects
if Group.objects.filter(
project=project,
last_seen__gte=timezone.now() - ACTIVE_PROJECT_THRESHOLD,
).exists()
]

project_file_map = {project.slug: get_all_stacktrace_paths(project) for project in projects}
filename_maps[org.slug] = project_file_map
return filename_maps


def get_all_stacktrace_paths(project: Project) -> List[str]:
groups = Group.objects.filter(
project=project, last_seen__gte=timezone.now() - GROUP_ANALYSIS_RANGE
)

Reviewer comment (Member) on lines +56 to +59:

This could be a significant number of groups, depending on the project. If it is too many, you might hit OOM issues.

Since you're just processing one group at a time, you could use RangeQuerySetWrapper to keep memory usage bounded.

I'm also not totally sure you need to fetch all groups from this time range. It seems like you mostly just want to sample a few groups and check their stack trace? Another way to do this is to query Snuba for events from the last GROUP_ANALYSIS_RANGE and get a distinct list of platforms from them. That would allow this to be done with one query per project.
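To make the first suggestion concrete, here is a minimal sketch of how the loop over groups could be bounded. It assumes sentry.utils.query.RangeQuerySetWrapper (the chunked-iteration helper used elsewhere in the codebase) and the step size of 100 is arbitrary; the rest of the function is unchanged.

from sentry.utils.query import RangeQuerySetWrapper

def get_all_stacktrace_paths(project: Project) -> List[str]:
    groups = Group.objects.filter(
        project=project, last_seen__gte=timezone.now() - GROUP_ANALYSIS_RANGE
    )
    all_stacktrace_paths: Set[str] = set()
    # Iterate the queryset in fixed-size chunks so memory stays bounded
    # no matter how many groups match the filter.
    for group in RangeQuerySetWrapper(groups, step=100):
        event = group.get_latest_event()
        is_python_stacktrace, stacktrace_paths = get_stacktrace_paths(event.data)
        if not is_python_stacktrace:
            return []
        all_stacktrace_paths.update(stacktrace_paths)
    return list(all_stacktrace_paths)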

Reviewer comment (@wedamija, Member, Oct 21, 2022): Actually, even better, you can make use of ProjectPlatform, which is generated by this task:

def collect_project_platforms(paginate=1000, **kwargs):
now = timezone.now()
for page_of_project_ids in paginate_project_ids(paginate):
queryset = (
Group.objects.using_replica()
.filter(
last_seen__gte=now - timedelta(days=1),
project_id__in=page_of_project_ids,
platform__isnull=False,
)
.values_list("platform", "project_id")
.distinct()
)
for platform, project_id in queryset:
platform = platform.lower()
if platform not in VALID_PLATFORMS:
continue
ProjectPlatform.objects.create_or_update(
project_id=project_id, platform=platform, values={"last_seen": now}
)
# remove (likely) unused platform associations
ProjectPlatform.objects.filter(last_seen__lte=now - timedelta(days=90)).delete()

Should be fine to just look at this: the platform passed via events is stored on the group, and we typically trust the platform reported by the SDK.
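As an illustration of that suggestion, a per-project platform check could replace the group sampling entirely. This is a sketch only: the helper name is_python_project, the import location of ProjectPlatform, and the exact platform value checked are assumptions.

from sentry.models import ProjectPlatform

def is_python_project(project: Project) -> bool:
    # Hypothetical helper: rely on the platforms already collected by
    # collect_project_platforms instead of sampling groups and events here.
    # A broader filter (e.g. platform__startswith="python") may be needed if
    # variants such as python-django should also count.
    return ProjectPlatform.objects.filter(
        project_id=project.id, platform="python"
    ).exists()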

Reviewer comment (Member): Actually, I think I'm misunderstanding what this is doing; where are these paths being used in general?

all_stacktrace_paths = set()
for group in groups:
event = group.get_latest_event()
Reviewer comment (Member): Doing this once per group will likely be quite slow, since it will make n+1 queries. I think you should be able to batch these by passing a list of group IDs to a Snuba query.

is_python_stacktrace, stacktrace_paths = get_stacktrace_paths(event.data)
if not is_python_stacktrace:
return []
all_stacktrace_paths.update(stacktrace_paths)

return list(all_stacktrace_paths)


def get_stacktrace_paths(data: NodeData) -> Tuple[bool, Set[str]]:
"""
    Extract the file paths from the stacktraces in the given event data, and report whether they appear to be Python paths.
"""
stacktraces = get_stacktrace(data)
stacktrace_paths = set()
for stacktrace in stacktraces:
try:
paths = [frame["filename"] for frame in stacktrace["frames"]]
if len(paths) == 0:
continue
if paths[0].endswith(".py"):
stacktrace_paths.update(paths)
else:
return False, set() # (is_python, stacktrace_paths)
except Exception:
            logger.exception("Error getting stacktrace paths from event data")
return True, stacktrace_paths # (is_python, stacktrace_paths)


def get_stacktrace(data: NodeData) -> List[Mapping[str, Any]]:
exceptions = get_path(data, "exception", "values", filter=True)
if exceptions:
return [e["stacktrace"] for e in exceptions if get_path(e, "stacktrace", "frames")]

stacktrace = data.get("stacktrace")
if stacktrace and stacktrace.get("frames"):
return [stacktrace]

return []
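For reference, a minimal sketch of how the task could be exercised; the synchronous call mirrors what the tests below do, while .delay() would enqueue the work onto the derive_code_mappings queue. The organization slug is a placeholder, and nothing in this PR actually wires up either call.

from sentry.models.organization import Organization
from sentry.tasks.derive_code_mappings import identify_stacktrace_paths

# Synchronous call, as the tests use it. The result is shaped like:
#   {"org-slug": {"project-slug": ["sentry/tasks.py", "sentry/models/release.py"]}}
org = Organization.objects.get(slug="my-org")  # hypothetical slug
mapping = identify_stacktrace_paths([org])

# Asynchronous call: routed to the "derive_code_mappings" queue because the
# task declares queue="derive_code_mappings"; with no arguments it scans all
# active organizations.
identify_stacktrace_paths.delay()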
145 changes: 145 additions & 0 deletions tests/sentry/tasks/test_derive_code_mappings.py
@@ -0,0 +1,145 @@
from copy import deepcopy

from sentry.models.organization import OrganizationStatus
from sentry.tasks.derive_code_mappings import identify_stacktrace_paths
from sentry.testutils import TestCase
from sentry.testutils.helpers.datetime import before_now, iso_format


class TestIdentifyStacktracePaths(TestCase):
def setUp(self):
self.organization = self.create_organization(status=OrganizationStatus.ACTIVE)
self.project = self.create_project(organization=self.organization)
self.test_data_1 = {
"message": "Kaboom!",
"platform": "python",
"timestamp": iso_format(before_now(days=1)),
"stacktrace": {
"frames": [
{
"function": "handle_set_commits",
"abs_path": "/usr/src/sentry/src/sentry/tasks.py",
"module": "sentry.tasks",
"in_app": False,
"lineno": 30,
"filename": "sentry/tasks.py",
},
{
"function": "set_commits",
"abs_path": "/usr/src/sentry/src/sentry/models/release.py",
"module": "sentry.models.release",
"in_app": True,
"lineno": 39,
"filename": "sentry/models/release.py",
},
]
},
"fingerprint": ["put-me-in-the-control-group"],
}
self.test_data_2 = deepcopy(self.test_data_1)
self.test_data_2["stacktrace"]["frames"][0]["filename"] = "sentry/test_file.py"
self.test_data_2["stacktrace"]["frames"][1]["filename"] = "sentry/models/test_file.py"
self.test_data_2["fingerprint"] = ["new-group"]
self.test_data_2["timestamp"] = iso_format(before_now(days=2))

def test_finds_stacktrace_paths_single_project(self):
self.store_event(data=self.test_data_1, project_id=self.project.id)

with self.tasks():
mapping = identify_stacktrace_paths([self.organization])
assert self.organization.slug in mapping

stacktrace_paths = mapping[self.organization.slug]
assert self.project.slug in stacktrace_paths
assert sorted(stacktrace_paths[self.project.slug]) == [
"sentry/models/release.py",
"sentry/tasks.py",
]

def test_finds_stacktrace_paths_multiple_projects(self):
project_2 = self.create_project(organization=self.organization)
self.store_event(data=self.test_data_1, project_id=self.project.id)
self.store_event(data=self.test_data_2, project_id=project_2.id)

with self.tasks():
mapping = identify_stacktrace_paths([self.organization])
assert self.organization.slug in mapping
stacktrace_paths = mapping[self.organization.slug]
assert self.project.slug in stacktrace_paths
assert sorted(stacktrace_paths[self.project.slug]) == [
"sentry/models/release.py",
"sentry/tasks.py",
]
assert project_2.slug in stacktrace_paths
assert sorted(stacktrace_paths[project_2.slug]) == [
"sentry/models/test_file.py",
"sentry/test_file.py",
]

def test_finds_stacktrace_paths_multiple_orgs(self):
new_org = self.create_organization()
new_project = self.create_project(organization=new_org)
        self.store_event(data=self.test_data_1, project_id=self.project.id)
self.store_event(data=self.test_data_2, project_id=new_project.id)

with self.tasks():
mapping = identify_stacktrace_paths([self.organization, new_org])
assert self.organization.slug in mapping
stacktrace_paths = mapping[self.organization.slug]
assert self.project.slug in stacktrace_paths
assert sorted(stacktrace_paths[self.project.slug]) == [
"sentry/models/release.py",
"sentry/tasks.py",
]
assert new_org.slug in mapping
stacktrace_paths = mapping[new_org.slug]
assert new_project.slug in stacktrace_paths
assert sorted(stacktrace_paths[new_project.slug]) == [
"sentry/models/test_file.py",
"sentry/test_file.py",
]

def test_skips_stale_projects(self):
stale_event = deepcopy(self.test_data_1)
stale_event["timestamp"] = iso_format(before_now(days=8))
self.store_event(data=stale_event, project_id=self.project.id)

with self.tasks():
mapping = identify_stacktrace_paths()
assert self.organization.slug in mapping
stacktrace_paths = mapping[self.organization.slug]
assert self.project.slug not in stacktrace_paths

def test_skips_outdated_events(self):
stale_event = deepcopy(self.test_data_2)
stale_event["timestamp"] = iso_format(before_now(days=16))
self.store_event(data=self.test_data_1, project_id=self.project.id)
self.store_event(data=stale_event, project_id=self.project.id)

with self.tasks():
mapping = identify_stacktrace_paths([self.organization])
assert self.organization.slug in mapping
stacktrace_paths = mapping[self.organization.slug]
assert self.project.slug in stacktrace_paths
assert sorted(stacktrace_paths[self.project.slug]) == [
"sentry/models/release.py",
"sentry/tasks.py",
]

def test_handle_duplicate_filenames_in_a_project(self):
self.store_event(data=self.test_data_1, project_id=self.project.id)
duplicate_event = deepcopy(self.test_data_2)
duplicate_event["stacktrace"]["frames"].append(self.test_data_1["stacktrace"]["frames"][0])
self.store_event(data=duplicate_event, project_id=self.project.id)

with self.tasks():
mapping = identify_stacktrace_paths([self.organization])
assert self.organization.slug in mapping
stacktrace_paths = mapping[self.organization.slug]
assert self.project.slug in stacktrace_paths
assert sorted(stacktrace_paths[self.project.slug]) == [
"sentry/models/release.py",
"sentry/models/test_file.py",
"sentry/tasks.py",
"sentry/test_file.py",
]