Skip to content

Commit

Permalink
Implement search backend abstraction
Browse files Browse the repository at this point in the history
  • Loading branch information
dcramer committed Oct 20, 2013
1 parent cf8611f commit c765dee
Show file tree
Hide file tree
Showing 23 changed files with 1,909 additions and 241 deletions.
15 changes: 8 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,19 @@


install_requires = [
'cssutils>0.9.9,<0.9.11',
'BeautifulSoup>=3.2.1,<3.3.0',
'django-celery>=3.0.11,<3.1.0',
'celery>=3.0.15,<3.1.0',
'django-crispy-forms>=1.2.3,<1.3.0',
'cssutils>=0.9.9,<0.10.0',
'Django>=1.5.4,<1.6',
'django-paging>=0.2.5,<0.3.0',
'django-celery>=3.0.11,<3.1.0',
'django-crispy-forms>=1.2.3,<1.3.0',
'django-paging>=0.2.4,<0.3.0',
'django-picklefield>=0.3.0,<0.4.0',
'django-social-auth>=0.7.28,<0.8.0',
'django-static-compiler>=0.3.0,<0.4.0',
'django-templatetag-sugar>=0.1.0,<0.2.0',
'gunicorn>=0.17.2,<0.18.0',
'httpagentparser>=1.2.1,<1.3.0',
'logan>=0.5.8.2,<0.6.0',
'nydus>=0.10.0,<0.11.0',
'Pygments>=1.6.0,<1.7.0',
Expand All @@ -82,10 +84,9 @@
'raven>=3.3.8',
'redis>=2.7.0,<2.9.0',
'simplejson>=3.1.0,<3.4.0',
'South>=0.8.0,<0.9.0',
'httpagentparser>=1.2.1,<1.3.0',
'django-social-auth>=0.7.28,<0.8.0',
'setproctitle>=1.1.7,<1.2.0',
'South>=0.8.0,<0.9.0',
'urllib3>=1.7.1,<1.8.0',
]

postgres_requires = [
Expand Down
5 changes: 4 additions & 1 deletion src/sentry/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@ class State(local):
request = None
data = {}

env = State()


def get_instance(path, options):
    """
    Build a backend instance from a settings-style specification.

    :param path: import path handed to ``import_string`` to resolve the
                 backend class (presumably a dotted path such as
                 ``'sentry.quotas.Quota'`` -- confirm against the settings).
    :param options: mapping expanded into keyword arguments for the class
                    constructor.
    :returns: a new instance of the resolved class.
    """
    cls = import_string(path)
    return cls(**options)


buffer = get_instance(settings.SENTRY_BUFFER, settings.SENTRY_BUFFER_OPTIONS)
quotas = get_instance(settings.SENTRY_QUOTAS, settings.SENTRY_QUOTA_OPTIONS)
env = State()
search = get_instance(settings.SENTRY_SEARCH, settings.SENTRY_SEARCH_OPTIONS)
18 changes: 14 additions & 4 deletions src/sentry/conf/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@
'kombu.transport.django',
'raven.contrib.django.raven_compat',
'sentry',
'sentry.search',
'sentry.plugins.sentry_interface_types',
'sentry.plugins.sentry_mail',
'sentry.plugins.sentry_urls',
Expand Down Expand Up @@ -494,10 +495,6 @@
# manually.
SENTRY_ALLOW_REGISTRATION = True

# Instructs Sentry to utilize its internal search indexer on all incoming
# events.
SENTRY_USE_SEARCH = True

# Enable trend results. These can be expensive and are calculated in real-time.
# When disabled they will be replaced w/ a default priority sort.
SENTRY_USE_TRENDING = True
Expand All @@ -517,11 +514,24 @@

SENTRY_QUOTAS = 'sentry.quotas.Quota'
SENTRY_QUOTA_OPTIONS = {}

# The default value for project-level quotas
SENTRY_DEFAULT_MAX_EVENTS_PER_MINUTE = '90%'

# The maximum number of events per minute the system should accept.
SENTRY_SYSTEM_MAX_EVENTS_PER_MINUTE = 0

# Search backend
SENTRY_SEARCH = 'sentry.search.django.DjangoSearchBackend'
SENTRY_SEARCH_OPTIONS = {}
# SENTRY_SEARCH_OPTIONS = {
# 'urls': ['http://localhost:9200/'],
# 'timeout': 5,
# }

# Enable search within the frontend
SENTRY_USE_SEARCH = True

SENTRY_RAVEN_JS_URL = 'd3nslu0hdya83q.cloudfront.net/dist/1.0/raven.min.js'

# URI Prefixes for generating DSN URLs
Expand Down
133 changes: 0 additions & 133 deletions src/sentry/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,9 @@

from __future__ import with_statement

from collections import defaultdict
import datetime
import hashlib
import itertools
import logging
import re
import time
import warnings
import uuid
Expand All @@ -26,7 +23,6 @@
from django.db.models import Sum
from django.utils import timezone
from django.utils.datastructures import SortedDict
from django.utils.encoding import force_unicode

from raven.utils.encoding import to_string
from sentry import app
Expand Down Expand Up @@ -949,135 +945,6 @@ def clear_cache(self, **kwargs):
self.__metadata = {}


class SearchDocumentManager(BaseManager):
    """
    Manager that maintains and queries the token-based search index.

    ``index()`` records per-field token counts for an event against the
    search document of the event's group, and ``search()`` looks documents up
    by token with a raw SQL query.
    """

    # Words which should not be indexed
    STOP_WORDS = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you', 'that'])

    # Do not index any words shorter than this
    MIN_WORD_LENGTH = 3

    # Consider these characters to be punctuation (they will be replaced with spaces prior to word extraction)
    PUNCTUATION_CHARS = re.compile('[%s]' % re.escape(".,;:!?@$%^&*()-<>[]{}\\|/`~'\""))

    def _tokenize(self, text):
        """
        Given a string, returns a list of tokens.

        Punctuation is replaced with spaces, the text is split on whitespace,
        and words that are shorter than ``MIN_WORD_LENGTH`` or listed in
        ``STOP_WORDS`` are dropped. Each remaining token is truncated to 128
        characters and lowercased. Falsy input yields an empty list.
        """
        if not text:
            return []

        text = self.PUNCTUATION_CHARS.sub(' ', text)

        words = [
            t[:128].lower()
            for t in text.split()
            if len(t) >= self.MIN_WORD_LENGTH and t.lower() not in self.STOP_WORDS
        ]

        return words

    def search(self, project, query, sort_by='score', offset=0, limit=100):
        """
        Return search documents of ``project`` matching the tokens of ``query``.

        :param project: object whose ``id`` restricts the search.
        :param query: free-form text; it is tokenized with ``_tokenize``.
        :param sort_by: ``'score'``, ``'new'`` or ``'date'``.
        :param offset: number of rows to skip.
        :param limit: maximum number of rows to return.
        :returns: the result of ``self.raw(sql, params)``.
        :raises ValueError: if ``sort_by`` is not one of the supported values.
        """
        tokens = self._tokenize(query)

        if sort_by == 'score':
            order_by = 'SUM(st.times_seen) / sd.total_events DESC'
        elif sort_by == 'new':
            order_by = 'sd.date_added DESC'
        elif sort_by == 'date':
            order_by = 'sd.date_changed DESC'
        else:
            raise ValueError('sort_by: %r' % sort_by)

        if tokens:
            # One driver-level placeholder per token; the token values
            # themselves travel separately in ``params``.
            token_sql = ' st.token IN (%s) AND ' % \
                ', '.join(['%s'] * len(tokens))
        else:
            token_sql = ' '

        # The statement text is kept flush-left so the string sent to the
        # database is unchanged.
        sql = """
SELECT sd.*,
SUM(st.times_seen) / sd.total_events as score
FROM sentry_searchdocument as sd
INNER JOIN sentry_searchtoken as st
ON st.document_id = sd.id
WHERE %s
sd.project_id = %s
GROUP BY sd.id, sd.group_id, sd.total_events, sd.date_changed, sd.date_added, sd.project_id, sd.status
ORDER BY %s
LIMIT %d OFFSET %d
""" % (
            token_sql,
            project.id,
            order_by,
            limit,
            offset,
        )
        params = tokens

        return self.raw(sql, params)

    def index(self, event):
        """
        Record ``event`` in the search index and return its search document.

        The document for the event's project and group is fetched or created.
        Search context is then gathered from every interface of the event,
        plus the event's message, logger, server name and culprit under the
        ``'text'`` field. Only ``'text'`` values are tokenized; values of
        other fields are lowercased and used whole. The resulting per-field
        token counts are passed to ``app.buffer.incr`` for ``SearchToken``.
        """
        from sentry.models import SearchToken

        group = event.group
        document, created = self.get_or_create(
            project=event.project,
            group=group,
            defaults={
                'status': group.status,
                'total_events': 1,
                'date_added': group.first_seen,
                'date_changed': group.last_seen,
            }
        )
        if not created:
            app.buffer.incr(self.model, {
                'total_events': 1,
            }, {
                'id': document.id,
            }, {
                'date_changed': group.last_seen,
                'status': group.status,
            })

            # NOTE(review): these assignments are read as part of the
            # ``not created`` branch (a freshly created document already
            # carries these values) -- confirm against the original layout.
            document.total_events += 1
            document.date_changed = group.last_seen
            document.status = group.status

        context = defaultdict(list)
        for interface in event.interfaces.itervalues():
            for k, v in interface.get_search_context(event).iteritems():
                context[k].extend(v)

        context['text'].extend([
            event.message,
            event.logger,
            event.server_name,
            event.culprit,
        ])

        token_counts = defaultdict(lambda: defaultdict(int))
        for field, values in context.iteritems():
            field = field.lower()
            # Drop empty values up front: ``force_unicode(None)`` would turn
            # a missing attribute into the literal token "none", and
            # ``None.lower()`` would raise for non-text fields.
            values = [v for v in values if v]
            if field == 'text':
                # we only tokenize the base text field
                values = itertools.chain.from_iterable(
                    self._tokenize(force_unicode(v)) for v in values
                )
            else:
                values = [v.lower() for v in values]
            for value in values:
                if not value:
                    continue
                token_counts[field][value] += 1

        for field, tokens in token_counts.iteritems():
            for token, count in tokens.iteritems():
                app.buffer.incr(SearchToken, {
                    'times_seen': count,
                }, {
                    'document': document,
                    'token': token,
                    'field': field,
                })

        return document


class TagKeyManager(BaseManager):
def _get_cache_key(self, project_id):
return 'filterkey:all:%s' % project_id
Expand Down
60 changes: 8 additions & 52 deletions src/sentry/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,16 @@
MEMBER_OWNER, MEMBER_USER, PLATFORM_TITLES, PLATFORM_LIST,
STATUS_UNRESOLVED, STATUS_RESOLVED, STATUS_VISIBLE, STATUS_HIDDEN,
MINUTE_NORMALIZATION, STATUS_MUTED, RESERVED_TEAM_SLUGS,
LOG_LEVELS, MAX_CULPRIT_LENGTH, MAX_TAG_KEY_LENGTH, MAX_TAG_VALUE_LENGTH)
LOG_LEVELS, MAX_CULPRIT_LENGTH, MAX_TAG_KEY_LENGTH, MAX_TAG_VALUE_LENGTH
)
from sentry.db.models import (
Model, GzippedDictField, BoundedIntegerField, BoundedPositiveIntegerField,
update, sane_repr)
update, sane_repr
)
from sentry.manager import (
GroupManager, ProjectManager,
MetaManager, InstanceMetaManager, SearchDocumentManager, BaseManager,
UserOptionManager, TagKeyManager, TeamManager, UserManager)
GroupManager, ProjectManager, MetaManager, InstanceMetaManager, BaseManager,
UserOptionManager, TagKeyManager, TeamManager, UserManager
)
from sentry.signals import buffer_incr_complete, regression_signal
from sentry.utils.cache import memoize
from sentry.utils.db import has_trending
Expand All @@ -53,7 +55,7 @@
from sentry.utils.safe import safe_execute
from sentry.utils.strings import truncatechars, strip

__all__ = ('Event', 'Group', 'Project', 'SearchDocument')
__all__ = ('Event', 'Group', 'Project')


def slugify_instance(inst, label, reserved=(), **kwargs):
Expand Down Expand Up @@ -975,36 +977,6 @@ class Meta:
__repr__ = sane_repr('project_id', 'date')


class SearchDocument(Model):
    """
    Search index entry for a group within a project.

    There is at most one document per (project, group) pair.
    """
    project = models.ForeignKey(Project)
    group = models.ForeignKey(Group)
    # Defaults to 1 for a newly created document.
    total_events = BoundedPositiveIntegerField(default=1)
    status = BoundedPositiveIntegerField(default=0)
    date_added = models.DateTimeField(default=timezone.now)
    date_changed = models.DateTimeField(default=timezone.now)

    objects = SearchDocumentManager()

    class Meta:
        unique_together = (('project', 'group'),)

    __repr__ = sane_repr('project_id', 'group_id')


class SearchToken(Model):
    """
    A token recorded for a search document under a named field.

    Each (document, field, token) combination is stored once, with
    ``times_seen`` holding its counter.
    """
    document = models.ForeignKey(SearchDocument, related_name="token_set")
    # Field the token belongs to; 'text' unless stated otherwise.
    field = models.CharField(max_length=64, default='text')
    token = models.CharField(max_length=128)
    times_seen = BoundedPositiveIntegerField(default=1)

    objects = BaseManager()

    class Meta:
        unique_together = (('document', 'field', 'token'),)

    __repr__ = sane_repr('document_id', 'field', 'token')


class UserOption(Model):
"""
User options apply only to a user, and optionally a project.
Expand Down Expand Up @@ -1349,16 +1321,6 @@ def create_team_member_for_owner(instance, created, **kwargs):
)


def update_document(instance, created, **kwargs):
    """
    Copy a saved group's status onto its search document.

    Does nothing when the instance was just created. Otherwise the status of
    every ``SearchDocument`` matching the instance's project and the instance
    itself (as ``group``) is updated in a single query.

    :param instance: the saved object; expected to expose ``project`` and
                     ``status``.
    :param created: True when the save created the row.
    """
    if created:
        return

    SearchDocument.objects.filter(
        project=instance.project,
        group=instance,
    ).update(status=instance.status)


def remove_key_for_team_member(instance, **kwargs):
for project in instance.team.project_set.all():
ProjectKey.objects.filter(
Expand Down Expand Up @@ -1447,12 +1409,6 @@ def on_alert_creation(instance, **kwargs):
dispatch_uid="create_team_member_for_owner",
weak=False,
)
post_save.connect(
update_document,
sender=Group,
dispatch_uid="update_document",
weak=False,
)
pre_delete.connect(
remove_key_for_team_member,
sender=TeamMember,
Expand Down
9 changes: 9 additions & 0 deletions src/sentry/search/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
sentry.search
~~~~~~~~~~~~~
:copyright: (c) 2010-2013 by the Sentry Team, see AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""

from .base import * # NOQA
23 changes: 23 additions & 0 deletions src/sentry/search/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
sentry.search.base
~~~~~~~~~~~~~~~~~~
:copyright: (c) 2010-2013 by the Sentry Team, see AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""

from __future__ import absolute_import


class SearchBackend(object):
    """
    Base class for search backends.

    ``index`` and ``remove`` are no-ops here, while ``query`` must be provided
    by a subclass.
    """

    def __init__(self, **options):
        """Accept arbitrary keyword options; the base class ignores them."""

    def index(self, group, event):
        """Index ``event`` for ``group``. No-op in the base class."""

    def remove(self, group):
        """Remove ``group`` from the index. No-op in the base class."""

    def query(self, **kwargs):
        """
        Run a search.

        :raises NotImplementedError: always, in the base class.
        """
        raise NotImplementedError
9 changes: 9 additions & 0 deletions src/sentry/search/django/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
sentry.search.django
~~~~~~~~~~~~~~~~~~~~
:copyright: (c) 2010-2013 by the Sentry Team, see AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""

from .backend import * # NOQA
Loading

1 comment on commit c765dee

@dcramer
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't actually implement the query interface for Solr, so this should still be considered a private feature.

Landing it so we can begin the final release phase on getsentry.com, and during that we'll smooth the transition and finish the tooling around non-Django backends.

Please sign in to comment.