Skip to content

Commit f643bcc

Browse files
authored
ref: No longer prune duplicated data before persisting for future refactorings (#10683)
1 parent 1dbceb1 commit f643bcc

File tree

6 files changed

+164
-84
lines changed

6 files changed

+164
-84
lines changed

src/sentry/event_manager.py

Lines changed: 101 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@
8686
ENABLE_RUST = os.environ.get("SENTRY_USE_RUST_NORMALIZER", "false").lower() in ("1", "true")
8787

8888

89+
def set_tag(data, key, value):
90+
data['tags'] = [(k, v) for k, v in data['tags'] if k != key]
91+
data['tags'].append((key, value))
92+
93+
8994
def get_event_metadata_compat(data, fallback_message):
9095
"""This is a fallback path to getting the event metadata. This is used
9196
by some code paths that could potentially deal with old sentry events that
@@ -412,6 +417,7 @@ def __init__(
412417
self._auth = auth
413418
self._key = key
414419
self._for_store = for_store
420+
self._normalized = False
415421

416422
def process_csp_report(self):
417423
"""Only called from the CSP report endpoint."""
@@ -464,6 +470,10 @@ def clean(d):
464470
self._data = data
465471

466472
def normalize(self):
473+
if self._normalized:
474+
raise RuntimeError('Already normalized')
475+
self._normalized = True
476+
467477
if ENABLE_RUST:
468478
from semaphore.processing import StoreNormalizer
469479
rust_normalizer = StoreNormalizer(
@@ -656,8 +666,6 @@ def stringify(f):
656666
data.setdefault('fingerprint', None)
657667
data.setdefault('logger', DEFAULT_LOGGER_NAME)
658668
data.setdefault('platform', None)
659-
data.setdefault('server_name', None)
660-
data.setdefault('site', None)
661669
data.setdefault('tags', [])
662670
data.setdefault('transaction', None)
663671

@@ -730,12 +738,18 @@ def stringify(f):
730738
# XXX: This will be trimmed again when inserted into tag values
731739
data['transaction'] = trim(data['transaction'], MAX_CULPRIT_LENGTH)
732740

741+
# Move some legacy data into tags
742+
site = data.pop('site', None)
743+
if site is not None:
744+
set_tag(data, 'site', site)
745+
server_name = data.pop('server_name', None)
746+
if server_name is not None:
747+
set_tag(data, 'server_name', server_name)
748+
733749
# Do not add errors unless there are for non store mode
734750
if not self._for_store and not data.get('errors'):
735751
self._data.pop('errors')
736752

737-
self._data = data
738-
739753
def should_filter(self):
740754
'''
741755
returns (result: bool, reason: string or None)
@@ -785,16 +799,14 @@ def get_data(self):
785799
return self._data
786800

787801
def _get_event_instance(self, project_id=None):
788-
data = self._data.copy()
789-
event_id = data.pop('event_id')
790-
platform = data.pop('platform', None)
802+
data = self._data
803+
event_id = data.get('event_id')
804+
platform = data.get('platform')
791805

792-
recorded_timestamp = data.pop('timestamp')
806+
recorded_timestamp = data.get('timestamp')
793807
date = datetime.fromtimestamp(recorded_timestamp)
794808
date = date.replace(tzinfo=timezone.utc)
795-
796-
# unused
797-
time_spent = data.pop('time_spent', None)
809+
time_spent = data.get('time_spent')
798810

799811
return Event(
800812
project_id=project_id or self._project.id,
@@ -805,11 +817,30 @@ def _get_event_instance(self, project_id=None):
805817
platform=platform
806818
)
807819

808-
def get_search_message(self, data, event_metadata=None, culprit=None):
820+
def get_culprit(self):
821+
"""Helper to calculate the default culprit"""
822+
return force_text(
823+
self._data.get('culprit') or
824+
self._data.get('transaction') or
825+
generate_culprit(self._data, platform=self._data['platform']) or
826+
''
827+
)
828+
829+
def get_event_type(self):
830+
"""Returns the event type."""
831+
return eventtypes.get(self._data.get('type', 'default'))(self._data)
832+
833+
def get_search_message(self, event_metadata=None, culprit=None):
809834
"""This generates the internal event.message attribute which is used
810835
for search purposes. It adds a bunch of data from the metadata and
811836
the culprit.
812837
"""
838+
if event_metadata is None:
839+
event_metadata = self.get_event_type().get_metadata()
840+
if culprit is None:
841+
culprit = self.get_culprit()
842+
843+
data = self._data
813844
message = ''
814845

815846
if 'logentry' in data:
@@ -828,7 +859,13 @@ def get_search_message(self, data, event_metadata=None, culprit=None):
828859

829860
return trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH)
830861

831-
def save(self, project_id, raw=False):
862+
def save(self, project_id, raw=False, assume_normalized=False):
863+
# Normalize if needed
864+
if not self._normalized:
865+
if not assume_normalized:
866+
self.normalize()
867+
self._normalized = True
868+
832869
from sentry.tasks.post_process import index_event_tags
833870

834871
data = self._data
@@ -859,53 +896,50 @@ def save(self, project_id, raw=False):
859896
)
860897
return event
861898

862-
# pull out our top-level (non-data attr) kwargs
863-
level = data.pop('level')
864-
transaction_name = data.pop('transaction', None)
865-
culprit = data.pop('culprit', None)
866-
logger_name = data.pop('logger', None)
867-
server_name = data.pop('server_name', None)
868-
site = data.pop('site', None)
869-
checksum = data.pop('checksum', None)
870-
fingerprint = data.pop('fingerprint', None)
871-
release = data.pop('release', None)
872-
dist = data.pop('dist', None)
873-
environment = data.pop('environment', None)
874-
recorded_timestamp = data.get("timestamp")
875-
876-
# old events had a small chance of having a legacy message
877-
# attribute show up here. In all reality this is being coerced
878-
# into logentry for more than two years at this point (2018).
879-
data.pop('message', None)
880-
899+
# Pull out the culprit
900+
culprit = self.get_culprit()
901+
902+
# Pull the toplevel data we're interested in
903+
level = data.get('level')
904+
905+
# TODO(mitsuhiko): this code path should be gone by July 2018.
906+
# This is going to be fine because no code actually still depends
907+
# on integers here. When we need an integer it will be converted
908+
# into one later. Old workers used to send integers here.
909+
if level is not None and isinstance(level, six.integer_types):
910+
level = LOG_LEVELS[level]
911+
912+
transaction_name = data.get('transaction')
913+
logger_name = data.get('logger')
914+
checksum = data.get('checksum')
915+
fingerprint = data.get('fingerprint')
916+
release = data.get('release')
917+
dist = data.get('dist')
918+
environment = data.get('environment')
919+
recorded_timestamp = data.get('timestamp')
920+
921+
# We need to swap out the data with the one internal to the newly
922+
# created event object
881923
event = self._get_event_instance(project_id=project_id)
924+
self._data = data = event.data.data
925+
882926
event._project_cache = project
883927

884928
date = event.datetime
885929
platform = event.platform
886930
event_id = event.event_id
887-
data = event.data.data
888-
self._data = None
889-
890-
culprit = culprit or \
891-
transaction_name or \
892-
generate_culprit(data, platform=platform) or \
893-
''
894931

895-
culprit = force_text(culprit)
896932
if transaction_name:
897933
transaction_name = force_text(transaction_name)
898934

899-
# convert this to a dict to ensure we're only storing one value per key
900-
# as most parts of Sentry dont currently play well with multiple values
935+
# Some of the data that are toplevel attributes are duplicated
936+
# into tags (logger, level, environment, transaction). These are
937+
# different from legacy attributes which are normalized into tags
938+
# ahead of time (site, server_name).
901939
tags = dict(data.get('tags') or [])
902-
tags['level'] = LOG_LEVELS[level]
940+
tags['level'] = level
903941
if logger_name:
904942
tags['logger'] = logger_name
905-
if server_name:
906-
tags['server_name'] = server_name
907-
if site:
908-
tags['site'] = site
909943
if environment:
910944
tags['environment'] = trim(environment, MAX_TAG_VALUE_LENGTH)
911945
if transaction_name:
@@ -925,6 +959,9 @@ def save(self, project_id, raw=False):
925959

926960
if dist and release:
927961
dist = release.add_dist(dist, date)
962+
# dont allow a conflicting 'dist' tag
963+
if 'dist' in tags:
964+
del tags['dist']
928965
tags['sentry:dist'] = dist.name
929966
else:
930967
dist = None
@@ -973,42 +1010,35 @@ def save(self, project_id, raw=False):
9731010
else:
9741011
hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)]
9751012

976-
event_type = eventtypes.get(data.get('type', 'default'))(data)
1013+
event_type = self.get_event_type()
9771014
event_metadata = event_type.get_metadata()
9781015

9791016
data['type'] = event_type.key
9801017
data['metadata'] = event_metadata
9811018

9821019
# index components into ``Event.message``
9831020
# See GH-3248
984-
event.message = self.get_search_message(data, event_metadata, culprit)
1021+
event.message = self.get_search_message(event_metadata, culprit)
1022+
received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))
9851023

9861024
kwargs = {
9871025
'platform': platform,
988-
'message': event.message
1026+
'message': event.message,
1027+
'culprit': culprit,
1028+
'logger': logger_name,
1029+
'level': LOG_LEVELS_MAP.get(level),
1030+
'last_seen': date,
1031+
'first_seen': date,
1032+
'active_at': date,
1033+
'data': {
1034+
'last_received': received_timestamp,
1035+
'type': event_type.key,
1036+
# we cache the events metadata on the group to ensure its
1037+
# accessible in the stream
1038+
'metadata': event_metadata,
1039+
},
9891040
}
9901041

991-
received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))
992-
kwargs.update(
993-
{
994-
'culprit': culprit,
995-
'logger': logger_name,
996-
'level': level,
997-
'last_seen': date,
998-
'first_seen': date,
999-
'active_at': date,
1000-
'data': {
1001-
'last_received': received_timestamp,
1002-
'type':
1003-
event_type.key,
1004-
# we cache the events metadata on the group to ensure its
1005-
# accessible in the stream
1006-
'metadata':
1007-
event_metadata,
1008-
},
1009-
}
1010-
)
1011-
10121042
if release:
10131043
kwargs['first_release'] = release
10141044

src/sentry/models/event.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,27 +182,38 @@ def get_tag(self, key):
182182
return v
183183
return None
184184

185+
@property
186+
def release(self):
187+
return self.get_tag('sentry:release')
188+
185189
@property
186190
def dist(self):
187191
return self.get_tag('sentry:dist')
188192

189193
def as_dict(self):
190194
# We use a OrderedDict to keep elements ordered for a potential JSON serializer
191195
data = OrderedDict()
192-
data['id'] = self.event_id
196+
data['event_id'] = self.event_id
193197
data['project'] = self.project_id
194-
data['release'] = self.get_tag('sentry:release')
198+
data['release'] = self.release
195199
data['dist'] = self.dist
196200
data['platform'] = self.platform
197-
data['culprit'] = self.group.culprit
198201
data['message'] = self.get_legacy_message()
199202
data['datetime'] = self.datetime
200203
data['time_spent'] = self.time_spent
201-
data['tags'] = self.get_tags()
204+
data['tags'] = [(k.split('sentry:', 1)[-1], v) for (k, v) in self.get_tags()]
202205
for k, v in sorted(six.iteritems(self.data)):
206+
if k in data:
207+
continue
203208
if k == 'sdk':
204209
v = {v_k: v_v for v_k, v_v in six.iteritems(v) if v_k != 'client_ip'}
205210
data[k] = v
211+
212+
# for a long time culprit was not persisted. In those cases put
213+
# the culprit in from the group.
214+
if data.get('culprit') is None:
215+
data['culprit'] = self.group.culprit
216+
206217
return data
207218

208219
@property

src/sentry/tasks/store.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ def save_event(cache_key=None, data=None, start_time=None, event_id=None,
383383

384384
try:
385385
manager = EventManager(data)
386-
event = manager.save(project_id)
386+
event = manager.save(project_id, assume_normalized=True)
387387

388388
# Always load attachments from the cache so we can later prune them.
389389
# Only save them if the event-attachments feature is active, though.

src/sentry/testutils/fixtures.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,9 @@ def create_event(self, event_id=None, normalize=True, **kwargs):
499499
manager.normalize()
500500
kwargs['data'] = manager.get_data()
501501

502+
else:
503+
assert 'message' not in kwargs, 'do not pass message this way'
504+
502505
event = Event(event_id=event_id, **kwargs)
503506
EventMapping.objects.create(
504507
project_id=event.project.id,

0 commit comments

Comments
 (0)