Skip to content

Commit

Permalink
ref(crons): Normalize crons incident issues (#70289)
Browse files Browse the repository at this point in the history
Prior to incidents we created issues for each type of failure (error,
timeout, missed). This is because only one failed check-in was needed to
create an issue. With incidents you can configure how many failures are
needed, meaning there could be 2 missed, 1 timeout, and 1 error.

This removes the various issue occurrence types and replaces them with a
single MonitorIncidentType
  • Loading branch information
evanpurkhiser committed May 7, 2024
1 parent a7de04d commit 9d56889
Show file tree
Hide file tree
Showing 10 changed files with 39 additions and 322 deletions.
27 changes: 7 additions & 20 deletions src/sentry/issues/grouptype.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,10 +247,6 @@ class PerformanceGroupTypeDefaults:
noise_config = NoiseConfig()


class CronGroupTypeDefaults:
notification_config = NotificationConfig(context=[])


class ReplayGroupTypeDefaults:
notification_config = NotificationConfig(context=[])

Expand Down Expand Up @@ -518,36 +514,27 @@ class ProfileFunctionRegressionType(GroupType):


@dataclass(frozen=True)
class MonitorCheckInFailure(CronGroupTypeDefaults, GroupType):
class MonitorIncidentType(GroupType):
type_id = 4001
slug = "monitor_check_in_failure"
description = "Monitor Check In Failed"
description = "Crons Monitor Failed"
category = GroupCategory.CRON.value
released = True
creation_quota = Quota(3600, 60, 60_000) # 60,000 per hour, sliding window of 60 seconds
default_priority = PriorityLevel.HIGH
notification_config = NotificationConfig(context=[])


@dataclass(frozen=True)
class MonitorCheckInTimeout(CronGroupTypeDefaults, GroupType):
class MonitorCheckInTimeoutDeprecated(MonitorIncidentType, GroupType):
    # This is deprecated, only kept around for its type_id
type_id = 4002
slug = "monitor_check_in_timeout"
description = "Monitor Check In Timeout"
category = GroupCategory.CRON.value
released = True
creation_quota = Quota(3600, 60, 60_000) # 60,000 per hour, sliding window of 60 seconds
default_priority = PriorityLevel.HIGH


@dataclass(frozen=True)
class MonitorCheckInMissed(CronGroupTypeDefaults, GroupType):
class MonitorCheckInMissedDeprecated(MonitorIncidentType, GroupType):
    # This is deprecated, only kept around for its type_id
type_id = 4003
slug = "monitor_check_in_missed"
description = "Monitor Check In Missed"
category = GroupCategory.CRON.value
released = True
creation_quota = Quota(3600, 60, 60_000) # 60,000 per hour, sliding window of 60 seconds
default_priority = PriorityLevel.HIGH


@dataclass(frozen=True)
Expand Down
3 changes: 0 additions & 3 deletions src/sentry/monitors/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
# current limit is 28 days
MAX_TIMEOUT = 40_320

# Format to use in the issue subtitle for the missed check-in timestamp
SUBTITLE_DATETIME_FORMAT = "%b %d, %I:%M %p %Z"

# maximum value for incident + recovery thresholds to be set
# affects the performance of recent check-ins query
# lowering this may invalidate monitors + block check-ins
Expand Down
54 changes: 7 additions & 47 deletions src/sentry/monitors/logic/mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,8 @@
from django.db.models import Q

from sentry import features
from sentry.issues.grouptype import (
MonitorCheckInFailure,
MonitorCheckInMissed,
MonitorCheckInTimeout,
)
from sentry.issues.grouptype import MonitorIncidentType
from sentry.models.organization import Organization
from sentry.monitors.constants import SUBTITLE_DATETIME_FORMAT, TIMEOUT
from sentry.monitors.models import (
CheckInStatus,
MonitorCheckIn,
Expand Down Expand Up @@ -243,10 +238,8 @@ def create_issue_platform_occurrence(
monitor_env = failed_checkin.monitor_environment
current_timestamp = datetime.now(timezone.utc)

occurrence_data = get_occurrence_data(failed_checkin)

# Get last successful check-in to show in evidence display
last_successful_checkin_timestamp = "None"
last_successful_checkin_timestamp = "Never"
last_successful_checkin = monitor_env.get_last_successful_checkin()
if last_successful_checkin:
last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()
Expand All @@ -257,11 +250,11 @@ def create_issue_platform_occurrence(
project_id=monitor_env.monitor.project_id,
event_id=uuid.uuid4().hex,
fingerprint=[incident.grouphash],
type=occurrence_data["group_type"],
type=MonitorIncidentType,
issue_title=f"Monitor failure: {monitor_env.monitor.name}",
subtitle=occurrence_data["subtitle"],
subtitle="Your monitor has reached its failure threshold.",
evidence_display=[
IssueEvidence(name="Failure reason", value=occurrence_data["reason"], important=True),
IssueEvidence(name="Failure reason", value="incident", important=True),
IssueEvidence(
name="Environment", value=monitor_env.get_environment().name, important=False
),
Expand All @@ -272,9 +265,9 @@ def create_issue_platform_occurrence(
),
],
evidence_data={},
culprit=occurrence_data["reason"],
culprit="incident",
detection_time=current_timestamp,
level=occurrence_data["level"],
level="error",
assignee=monitor_env.monitor.owner_actor,
)

Expand Down Expand Up @@ -324,36 +317,3 @@ def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
"status": monitor_environment.get_status_display(),
"type": monitor_environment.monitor.get_type_display(),
}


def get_occurrence_data(checkin: MonitorCheckIn):
if checkin.status == CheckInStatus.MISSED:
expected_time = (
checkin.expected_time.astimezone(checkin.monitor.timezone).strftime(
SUBTITLE_DATETIME_FORMAT
)
if checkin.expected_time
else "the expected time"
)
return {
"group_type": MonitorCheckInMissed,
"level": "warning",
"reason": "missed_checkin",
"subtitle": f"No check-in reported on {expected_time}.",
}

if checkin.status == CheckInStatus.TIMEOUT:
duration = (checkin.monitor.config or {}).get("max_runtime") or TIMEOUT
return {
"group_type": MonitorCheckInTimeout,
"level": "error",
"reason": "duration",
"subtitle": f"Check-in exceeded maximum duration of {duration} minutes.",
}

return {
"group_type": MonitorCheckInFailure,
"level": "error",
"reason": "error",
"subtitle": "An error occurred during the latest check-in.",
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from sentry.digests.backends.redis import RedisBackend
from sentry.digests.notifications import event_to_record
from sentry.integrations.slack.message_builder.issues import get_tags
from sentry.issues.grouptype import MonitorCheckInFailure
from sentry.issues.grouptype import MonitorIncidentType
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.models.identity import Identity, IdentityStatus
from sentry.models.integrations.external_actor import ExternalActor
Expand Down Expand Up @@ -157,15 +157,15 @@ def test_crons_issue_alert_user_block(self):
IssueEvidence("Evidence 2", "Value 2", False),
IssueEvidence("Evidence 3", "Value 3", False),
],
MonitorCheckInFailure,
MonitorIncidentType,
datetime.now(UTC),
"info",
"/api/123",
)
occurrence.save()
event.occurrence = occurrence

event.group.type = MonitorCheckInFailure.type_id
event.group.type = MonitorIncidentType.type_id
notification = AlertRuleNotification(
Notification(event=event, rule=self.rule), ActionTargetType.MEMBER, self.user.id
)
Expand Down
4 changes: 2 additions & 2 deletions tests/sentry/integrations/slack/test_message_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from sentry.issues.grouptype import (
ErrorGroupType,
FeedbackGroup,
MonitorCheckInFailure,
MonitorIncidentType,
PerformanceP95EndpointRegressionGroupType,
ProfileFileIOGroupType,
)
Expand Down Expand Up @@ -1321,7 +1321,7 @@ def setUp(self):
type=PerformanceP95EndpointRegressionGroupType.type_id
)

self.cron_issue = self.create_group(type=MonitorCheckInFailure.type_id)
self.cron_issue = self.create_group(type=MonitorIncidentType.type_id)
self.feedback_issue = self.create_group(
type=FeedbackGroup.type_id, substatus=GroupSubStatus.NEW
)
Expand Down
4 changes: 2 additions & 2 deletions tests/sentry/issues/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
GroupCategory,
GroupType,
GroupTypeRegistry,
MonitorCheckInFailure,
MonitorIncidentType,
NoiseConfig,
)
from sentry.issues.ingest import (
Expand Down Expand Up @@ -248,7 +248,7 @@ def test_existing_group_different_category(self) -> None:

new_event = self.store_event(data={}, project_id=self.project.id)
new_occurrence = self.build_occurrence(
fingerprint=["some-fingerprint"], type=MonitorCheckInFailure.type_id
fingerprint=["some-fingerprint"], type=MonitorIncidentType.type_id
)
with mock.patch("sentry.issues.ingest.logger") as logger:
assert save_issue_from_occurrence(new_occurrence, new_event, None) is None
Expand Down
10 changes: 5 additions & 5 deletions tests/sentry/mail/test_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from sentry.api.serializers.models.userreport import UserReportWithGroupSerializer
from sentry.digests.notifications import build_digest, event_to_record
from sentry.event_manager import EventManager, get_event_type
from sentry.issues.grouptype import MonitorCheckInFailure
from sentry.issues.grouptype import MonitorIncidentType
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.mail import build_subject_prefix, mail_adapter
from sentry.models.activity import Activity
Expand Down Expand Up @@ -328,15 +328,15 @@ def test_simple_notification_generic(self):
IssueEvidence("Evidence 2", "Value 2", False),
IssueEvidence("Evidence 3", "Value 3", False),
],
MonitorCheckInFailure,
MonitorIncidentType,
timezone.now(),
"info",
"/api/123",
)
occurrence.save()
event.occurrence = occurrence

event.group.type = MonitorCheckInFailure.type_id
event.group.type = MonitorIncidentType.type_id

rule = Rule.objects.create(project=self.project, label="my rule")
ProjectOwnership.objects.create(project_id=self.project.id, fallthrough=True)
Expand Down Expand Up @@ -384,15 +384,15 @@ def test_simple_notification_generic_no_evidence(self):
"1234",
{"Test": 123},
[], # no evidence
MonitorCheckInFailure,
MonitorIncidentType,
timezone.now(),
"info",
"/api/123",
)
occurrence.save()
event.occurrence = occurrence

event.group.type = MonitorCheckInFailure.type_id
event.group.type = MonitorIncidentType.type_id

rule = Rule.objects.create(project=self.project, label="my rule")
ProjectOwnership.objects.create(project_id=self.project.id, fallthrough=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sentry.issues.grouptype import (
ErrorGroupType,
FeedbackGroup,
MonitorCheckInFailure,
MonitorIncidentType,
PerformanceConsecutiveHTTPQueriesGroupType,
PerformanceP95EndpointRegressionGroupType,
ReplayDeadClickType,
Expand Down Expand Up @@ -114,7 +114,7 @@ def _create_groups_to_backfill(self, project: Project) -> None:
{
"status": GroupStatus.UNRESOLVED,
"substatus": GroupSubStatus.ESCALATING,
"type": MonitorCheckInFailure.type_id,
"type": MonitorIncidentType.type_id,
},
PriorityLevel.HIGH,
),
Expand Down Expand Up @@ -181,7 +181,7 @@ def _create_groups_to_backfill(self, project: Project) -> None:
(
"cron group with log level WARNING",
{
"type": MonitorCheckInFailure.type_id,
"type": MonitorIncidentType.type_id,
"level": logging.WARNING,
},
PriorityLevel.MEDIUM,
Expand All @@ -190,15 +190,15 @@ def _create_groups_to_backfill(self, project: Project) -> None:
"cron group with log level ERROR",
{
"substatus": GroupSubStatus.ONGOING,
"type": MonitorCheckInFailure.type_id,
"type": MonitorIncidentType.type_id,
"level": logging.ERROR,
},
PriorityLevel.HIGH,
),
(
"cron group with log level DEBUG",
{
"type": MonitorCheckInFailure.type_id,
"type": MonitorIncidentType.type_id,
"level": logging.DEBUG,
},
PriorityLevel.HIGH,
Expand Down

0 comments on commit 9d56889

Please sign in to comment.