Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 101 additions & 71 deletions src/sentry/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@
ENABLE_RUST = os.environ.get("SENTRY_USE_RUST_NORMALIZER", "false").lower() in ("1", "true")


def set_tag(data, key, value):
    """Set a tag on an event payload, replacing any existing tag with the same key.

    Tags are stored as a list of ``(key, value)`` pairs under ``data['tags']``.
    Any pre-existing entries for ``key`` are removed before appending, so the
    result contains exactly one value per key.

    :param data: the event payload dict (mutated in place)
    :param key: the tag key to set
    :param value: the tag value to store
    """
    # Tolerate payloads that have no 'tags' key yet (or tags=None), instead
    # of raising a KeyError on legacy/partial event data.
    data['tags'] = [(k, v) for k, v in (data.get('tags') or []) if k != key]
    data['tags'].append((key, value))


def get_event_metadata_compat(data, fallback_message):
"""This is a fallback path to getting the event metadata. This is used
by some code paths that could potentially deal with old sentry events that
Expand Down Expand Up @@ -410,6 +415,7 @@ def __init__(
self._auth = auth
self._key = key
self._for_store = for_store
self._normalized = False

def process_csp_report(self):
"""Only called from the CSP report endpoint."""
Expand Down Expand Up @@ -462,6 +468,10 @@ def clean(d):
self._data = data

def normalize(self):
if self._normalized:
raise RuntimeError('Already normalized')
self._normalized = True

if ENABLE_RUST:
from semaphore.processing import StoreNormalizer
rust_normalizer = StoreNormalizer(
Expand Down Expand Up @@ -654,8 +664,6 @@ def stringify(f):
data.setdefault('fingerprint', None)
data.setdefault('logger', DEFAULT_LOGGER_NAME)
data.setdefault('platform', None)
data.setdefault('server_name', None)
data.setdefault('site', None)
data.setdefault('tags', [])
data.setdefault('transaction', None)

Expand Down Expand Up @@ -716,12 +724,18 @@ def stringify(f):
# XXX: This will be trimmed again when inserted into tag values
data['transaction'] = trim(data['transaction'], MAX_CULPRIT_LENGTH)

# Move some legacy data into tags
site = data.pop('site', None)
if site is not None:
set_tag(data, 'site', site)
server_name = data.pop('server_name', None)
if server_name is not None:
set_tag(data, 'server_name', server_name)

# Outside of store mode, drop the 'errors' key entirely when it is empty
if not self._for_store and not data.get('errors'):
self._data.pop('errors')

self._data = data

def should_filter(self):
'''
returns (result: bool, reason: string or None)
Expand Down Expand Up @@ -771,16 +785,14 @@ def get_data(self):
return self._data

def _get_event_instance(self, project_id=None):
data = self._data.copy()
event_id = data.pop('event_id')
platform = data.pop('platform', None)
data = self._data
event_id = data.get('event_id')
platform = data.get('platform')

recorded_timestamp = data.pop('timestamp')
recorded_timestamp = data.get('timestamp')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it matter that the event_id and timestamp getters are becoming safe? They would throw a KeyError before, and I'm not sure if that would catch anything of value before it caused problems.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no access to these attributes that would put stuff in.

date = datetime.fromtimestamp(recorded_timestamp)
date = date.replace(tzinfo=timezone.utc)

# unused
time_spent = data.pop('time_spent', None)
time_spent = data.get('time_spent')

return Event(
project_id=project_id or self._project.id,
Expand All @@ -791,11 +803,30 @@ def _get_event_instance(self, project_id=None):
platform=platform
)

def get_search_message(self, data, event_metadata=None, culprit=None):
def get_culprit(self):
    """Return the effective culprit string for this event.

    Falls back, in order: an explicit ``culprit`` in the payload, the
    ``transaction`` name, a culprit generated from the payload, and
    finally the empty string. The result is always coerced to text.
    """
    data = self._data
    culprit = data.get('culprit')
    if not culprit:
        culprit = data.get('transaction')
    if not culprit:
        culprit = generate_culprit(data, platform=data['platform'])
    return force_text(culprit or '')

def get_event_type(self):
    """Instantiate and return the event type object for this payload.

    The payload's ``type`` key selects the class; missing types fall
    back to the ``'default'`` event type.
    """
    type_key = self._data.get('type', 'default')
    event_type_cls = eventtypes.get(type_key)
    return event_type_cls(self._data)

def get_search_message(self, event_metadata=None, culprit=None):
"""This generates the internal event.message attribute which is used
for search purposes. It adds a bunch of data from the metadata and
the culprit.
"""
if event_metadata is None:
event_metadata = self.get_event_type().get_metadata()
if culprit is None:
culprit = self.get_culprit()

data = self._data
message = ''

if 'logentry' in data:
Expand All @@ -814,7 +845,13 @@ def get_search_message(self, data, event_metadata=None, culprit=None):

return trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH)

def save(self, project_id, raw=False):
def save(self, project_id, raw=False, assume_normalized=False):
# Normalize if needed
if not self._normalized:
if not assume_normalized:
self.normalize()
self._normalized = True

from sentry.tasks.post_process import index_event_tags

data = self._data
Expand Down Expand Up @@ -845,53 +882,50 @@ def save(self, project_id, raw=False):
)
return event

# pull out our top-level (non-data attr) kwargs
level = data.pop('level')
transaction_name = data.pop('transaction', None)
culprit = data.pop('culprit', None)
logger_name = data.pop('logger', None)
server_name = data.pop('server_name', None)
site = data.pop('site', None)
checksum = data.pop('checksum', None)
fingerprint = data.pop('fingerprint', None)
release = data.pop('release', None)
dist = data.pop('dist', None)
environment = data.pop('environment', None)
recorded_timestamp = data.get("timestamp")

# old events had a small chance of having a legacy message
# attribute show up here. In all reality this is being coerced
# into logentry for more than two years at this point (2018).
data.pop('message', None)

# Pull out the culprit
culprit = self.get_culprit()

# Pull the toplevel data we're interested in
level = data.get('level')

# TODO(mitsuhiko): this code path should be gone by July 2018.
# This is going to be fine because no code actually still depends
# on integers here. When we need an integer it will be converted
# into one later. Old workers used to send integers here.
if level is not None and isinstance(level, six.integer_types):
level = LOG_LEVELS[level]

transaction_name = data.get('transaction')
logger_name = data.get('logger')
checksum = data.get('checksum')
fingerprint = data.get('fingerprint')
release = data.get('release')
dist = data.get('dist')
environment = data.get('environment')
recorded_timestamp = data.get('timestamp')

# We need to swap out the data with the one internal to the newly
# created event object
event = self._get_event_instance(project_id=project_id)
self._data = data = event.data.data

event._project_cache = project

date = event.datetime
platform = event.platform
event_id = event.event_id
data = event.data.data
self._data = None

culprit = culprit or \
transaction_name or \
generate_culprit(data, platform=platform) or \
''

culprit = force_text(culprit)
if transaction_name:
transaction_name = force_text(transaction_name)

# convert this to a dict to ensure we're only storing one value per key
# as most parts of Sentry dont currently play well with multiple values
# Some of the data that are toplevel attributes are duplicated
# into tags (logger, level, environment, transaction). These are
# different from legacy attributes which are normalized into tags
# ahead of time (site, server_name).
tags = dict(data.get('tags') or [])
tags['level'] = LOG_LEVELS[level]
tags['level'] = level
if logger_name:
tags['logger'] = logger_name
if server_name:
tags['server_name'] = server_name
if site:
tags['site'] = site
if environment:
tags['environment'] = trim(environment, MAX_TAG_VALUE_LENGTH)
if transaction_name:
Expand All @@ -911,6 +945,9 @@ def save(self, project_id, raw=False):

if dist and release:
dist = release.add_dist(dist, date)
# don't allow a conflicting 'dist' tag
if 'dist' in tags:
del tags['dist']
tags['sentry:dist'] = dist.name
else:
dist = None
Expand Down Expand Up @@ -959,42 +996,35 @@ def save(self, project_id, raw=False):
else:
hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)]

event_type = eventtypes.get(data.get('type', 'default'))(data)
event_type = self.get_event_type()
event_metadata = event_type.get_metadata()

data['type'] = event_type.key
data['metadata'] = event_metadata

# index components into ``Event.message``
# See GH-3248
event.message = self.get_search_message(data, event_metadata, culprit)
event.message = self.get_search_message(event_metadata, culprit)
received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))

kwargs = {
'platform': platform,
'message': event.message
'message': event.message,
'culprit': culprit,
'logger': logger_name,
'level': LOG_LEVELS_MAP.get(level),
'last_seen': date,
'first_seen': date,
'active_at': date,
'data': {
'last_received': received_timestamp,
'type': event_type.key,
# we cache the events metadata on the group to ensure its
# accessible in the stream
'metadata': event_metadata,
},
}

received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))
kwargs.update(
{
'culprit': culprit,
'logger': logger_name,
'level': level,
'last_seen': date,
'first_seen': date,
'active_at': date,
'data': {
'last_received': received_timestamp,
'type':
event_type.key,
# we cache the events metadata on the group to ensure its
# accessible in the stream
'metadata':
event_metadata,
},
}
)

if release:
kwargs['first_release'] = release

Expand Down
19 changes: 15 additions & 4 deletions src/sentry/models/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,27 +182,38 @@ def get_tag(self, key):
return v
return None

@property
def release(self):
    """Release name for this event, read from the ``sentry:release`` tag (or None)."""
    tag_key = 'sentry:release'
    return self.get_tag(tag_key)

@property
def dist(self):
    """Distribution name for this event, read from the ``sentry:dist`` tag (or None)."""
    tag_key = 'sentry:dist'
    return self.get_tag(tag_key)

def as_dict(self):
# We use a OrderedDict to keep elements ordered for a potential JSON serializer
data = OrderedDict()
data['id'] = self.event_id
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't fully understand the meaning/repercussions of changing this (and the test), can you explain?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is only used in the JSON export. Neither UI nor plugins depend on this. The idea here is to make that output compatible to ingestion, so we can easily take such a payload and process it again, etc

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, got it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @mitsuhiko @jan-auer for changing this. It definitely annoyed me before that we were using id instead of event_id in this serialization but I had just assumed that changing it would blow up everything.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Etching ever closer to being able to re-ingest off to_json :)

data['event_id'] = self.event_id
data['project'] = self.project_id
data['release'] = self.get_tag('sentry:release')
data['release'] = self.release
data['dist'] = self.dist
data['platform'] = self.platform
data['culprit'] = self.group.culprit
data['message'] = self.get_legacy_message()
data['datetime'] = self.datetime
data['time_spent'] = self.time_spent
data['tags'] = self.get_tags()
data['tags'] = [(k.split('sentry:', 1)[-1], v) for (k, v) in self.get_tags()]
for k, v in sorted(six.iteritems(self.data)):
if k in data:
continue
if k == 'sdk':
v = {v_k: v_v for v_k, v_v in six.iteritems(v) if v_k != 'client_ip'}
data[k] = v

# for a long time culprit was not persisted. In those cases put
# the culprit in from the group.
if data.get('culprit') is None:
data['culprit'] = self.group.culprit

return data

@property
Expand Down
2 changes: 1 addition & 1 deletion src/sentry/tasks/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def save_event(cache_key=None, data=None, start_time=None, event_id=None,

try:
manager = EventManager(data)
event = manager.save(project_id)
event = manager.save(project_id, assume_normalized=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is becoming slightly dangerous now because nothing is asserting that only normalized data goes in the queue and all tests will normalize with this. If there is a bug, we can only find it in production.

This is not a problem of your change, though it is uncovered now. I just have no idea how to fix this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is already the case.


# Always load attachments from the cache so we can later prune them.
# Only save them if the event-attachments feature is active, though.
Expand Down
3 changes: 3 additions & 0 deletions src/sentry/testutils/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,9 @@ def create_event(self, event_id=None, normalize=True, **kwargs):
manager.normalize()
kwargs['data'] = manager.get_data()

else:
assert 'message' not in kwargs, 'do not pass message this way'

event = Event(event_id=event_id, **kwargs)
EventMapping.objects.create(
project_id=event.project.id,
Expand Down
Loading