Improve cleanup tasks (remove most filters, optimize deletion patterns) #731

Closed
wants to merge 1 commit

1 participant
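For context on the resulting API: after this patch the task keeps only the age and project scopes, and every other filter (logger, site, server, level, resolved) is dropped. A minimal sketch of the intended call, using the names from the diff below:

```python
from sentry.tasks.cleanup import cleanup

# Only `days` and `project` remain as scoping arguments after this change.
cleanup(days=30)            # remove all data older than 30 days
cleanup(days=7, project=1)  # restrict the purge to project 1
```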

src/sentry/management/commands/cleanup.py (17 changed lines)
@@ -14,23 +14,16 @@ class Command(BaseCommand):
option_list = BaseCommand.option_list + (
make_option('--days', default='30', type=int, help='Numbers of days to truncate on.'),
- make_option('--logger', help='Limit truncation to only entries from logger.'),
- make_option('--site', help='Limit truncation to only entries from site.'),
- make_option('--server', help='Limit truncation to only entries from server.'),
- make_option('--level', help='Limit truncation to only entries greater than or equal to level (e.g. DEBUG).'),
make_option('--project', type=int, help='Limit truncation to only entries from project.'),
- make_option('--resolved', dest='resolved', action='store_true', help='Limit truncation to only entries that are resolved.'),
- make_option('--unresolved', dest='resolved', action='store_false', help='Limit truncation to only entries that are unresolved.'),
)
def handle(self, **options):
import logging
from sentry.tasks.cleanup import cleanup
- level = options['level']
+ if options['verbosity'] > 1:
+ logger = cleanup.get_logger()
+ logger.setLevel(logging.DEBUG)
+ logger.addHandler(logging.StreamHandler())
- if level is not None and not str(level).isdigit():
- options['level'] = getattr(logging, level.upper())
-
- cleanup(days=options['days'], logger=options['logger'], site=options['site'], server=options['server'],
- level=options['level'], project=options['project'], resolved=options['resolved'])
+ cleanup(days=options['days'], project=options['project'])
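For illustration, a hedged sketch of how the slimmed-down command might be driven from code once this lands; `call_command` is standard Django, but the exact kwargs depend on the installed option set:

```python
from django.core.management import call_command

# --logger/--site/--server/--level/--resolved are gone; passing
# verbosity > 1 now attaches a DEBUG StreamHandler to the task logger
# instead of the old --level flag controlling what gets deleted.
call_command('cleanup', days=30, project=1, verbosity=2)
```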
src/sentry/tasks/cleanup.py (154 changed lines)
@@ -10,143 +10,73 @@
@task(ignore_result=True)
-def cleanup(days=30, logger=None, site=None, server=None, level=None,
- project=None, resolved=None, **kwargs):
+def cleanup(days=30, project=None, **kwargs):
"""
Deletes a portion of the trailing data in Sentry based on
their creation dates. For example, if ``days`` is 30, this
would attempt to clean up all data that's older than 30 days.
- :param logger: limit all deletion scopes to messages from the
- specified logger.
- :param site: limit the message deletion scope to the specified
- site.
- :param server: limit the message deletion scope to the specified
- server.
- :param level: limit all deletion scopes to messages that are greater
- than or equal to level.
:param project: limit all deletion scopes to messages that are part
of the given project
- :param resolved: limit all deletion scopes to messages that are resolved.
"""
import datetime
from django.utils import timezone
- from sentry.models import Group, Event, MessageCountByMinute, \
- MessageFilterValue, FilterKey, FilterValue, SearchDocument, ProjectCountByMinute
- from sentry.utils.query import RangeQuerySetWrapper, SkinnyQuerySet
- log = cleanup.get_logger()
-
- def cleanup_groups(iterable):
- for obj in iterable:
- log.info("Removing all matching <SearchDocument: group=%s>", obj.pk)
- SearchDocument.objects.filter(group=obj).delete()
- log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
- obj.delete()
+ from sentry.models import (Group, Event, MessageCountByMinute,
+ MessageFilterValue, FilterKey, FilterValue, ProjectCountByMinute,
+ SearchDocument)
+ from sentry.utils.query import RangeQuerySetWrapper
- # TODO: we should collect which messages above were deleted
- # and potentially just send out post_delete signals where
- # GroupedMessage can update itself accordingly
- ts = timezone.now() - datetime.timedelta(days=days)
+ GENERIC_DELETES = (
+ (SearchDocument, 'date_changed'),
+ (MessageCountByMinute, 'date'),
+ (ProjectCountByMinute, 'date'),
+ (Event, 'datetime'),
+ (Group, 'last_seen'),
+ (MessageFilterValue, 'last_seen'),
+ )
- # Message
- qs = SkinnyQuerySet(Event).filter(datetime__lte=ts)
- if logger:
- qs = qs.filter(logger=logger)
- if site:
- qs = qs.filter(site=site)
- if server:
- qs = qs.filter(server_name=server)
- if level:
- qs = qs.filter(level__gte=level)
- if project:
- qs = qs.filter(project=project)
- if resolved is True:
- qs = qs.filter(group__status=1)
- elif resolved is False:
- qs = qs.filter(group__status=0)
+ log = cleanup.get_logger()
- groups_to_check = set()
- if resolved is None:
- for obj in RangeQuerySetWrapper(qs):
- log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
- obj.delete()
- groups_to_check.add(obj.group_id)
+ ts = timezone.now() - datetime.timedelta(days=days)
- if not (server or site):
- # MessageCountByMinute
- qs = SkinnyQuerySet(MessageCountByMinute).filter(date__lte=ts)
- if logger:
- qs = qs.filter(group__logger=logger)
- if level:
- qs = qs.filter(group__level__gte=level)
+ # Remove types which can easily be bound to project + date
+ for model, date_col in GENERIC_DELETES:
+ log.info("Removing %r for days=%s project=%r" % (model, days, project))
+ qs = model.objects.filter(**{'%s__lte' % (date_col,): ts})
if project:
qs = qs.filter(project=project)
- if resolved is True:
- qs = qs.filter(group__status=1)
- elif resolved is False:
- qs = qs.filter(group__status=0)
-
- for obj in RangeQuerySetWrapper(qs):
- log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
- obj.delete()
+ qs.delete()
- # Group
- qs = SkinnyQuerySet(Group).filter(last_seen__lte=ts)
- if logger:
- qs = qs.filter(logger=logger)
- if level:
- qs = qs.filter(level__gte=level)
+ # We'll need this to confirm deletion of FilterKey and FilterValue objects.
+ if not project:
+ mqs = MessageFilterValue.objects.all()
if project:
- qs = qs.filter(project=project)
- if resolved is True:
- qs = qs.filter(status=1)
- elif resolved is False:
- qs = qs.filter(status=0)
-
- cleanup_groups(RangeQuerySetWrapper(qs))
+ mqs = mqs.filter(project=project)
- # Project counts
- # TODO: these dont handle filters
- qs = SkinnyQuerySet(ProjectCountByMinute).filter(date__lte=ts)
- if project:
- qs = qs.filter(project=project)
-
- for obj in RangeQuerySetWrapper(qs):
- log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
- obj.delete()
-
- # Filters
+ # FilterKey
+ log.info("Removing %r for days=%s project=%r" % (FilterKey, days, project))
qs = FilterKey.objects.all()
if project:
qs = qs.filter(project=project)
+ qs.delete()
+ else:
+ for obj in RangeQuerySetWrapper(qs):
+ if not mqs.filter(key=obj.key).exists():
+ log.info("Removing filters for unused filter %s=*", obj.key,)
+ qs.filter(key=obj.key).delete()
+ obj.delete()
- mqs = MessageFilterValue.objects.all()
- if project:
- mqs = mqs.filter(project=project)
-
- for obj in RangeQuerySetWrapper(qs):
- if not mqs.filter(key=obj.key).exists():
- log.info("Removing filters for unused filter %s=*", obj.key,)
- qs.filter(key=obj.key).delete()
- obj.delete()
-
+ # FilterValue
+ log.info("Removing %r for days=%s project=%r" % (FilterValue, days, project))
qs = FilterValue.objects.all()
if project:
qs = qs.filter(project=project)
-
- for obj in RangeQuerySetWrapper(qs):
- if not mqs.filter(key=obj.key, value=obj.value).exists():
- log.info("Removing filters for unused filter %s=%s", obj.key, obj.value)
- qs.filter(key=obj.key, value=obj.value).delete()
- obj.delete()
-
- # attempt to cleanup any groups that may now be empty
- groups_to_delete = []
- for group_id in groups_to_check:
- if not Event.objects.filter(group=group_id).exists():
- groups_to_delete.append(group_id)
-
- if groups_to_delete:
- cleanup_groups(SkinnyQuerySet(Group).filter(pk__in=groups_to_delete))
+ qs.delete()
+ else:
+ for obj in RangeQuerySetWrapper(qs):
+ if not mqs.filter(key=obj.key, value=obj.value).exists():
+ log.info("Removing filters for unused filter %s=%s", obj.key, obj.value)
+ qs.filter(key=obj.key, value=obj.value).delete()
+ obj.delete()
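The heart of the optimization is the `GENERIC_DELETES` table: one bulk `DELETE` per model, bounded by that model's date column, instead of walking rows one at a time. A condensed restatement of that loop (same model/column pairs as the diff above, not an exact copy of the patch):

```python
import datetime
from django.utils import timezone

def purge_generic(days=30, project=None):
    from sentry.models import (SearchDocument, MessageCountByMinute,
        ProjectCountByMinute, Event, Group, MessageFilterValue)

    # Each entry pairs a model with the column that tracks its age.
    generic_deletes = (
        (SearchDocument, 'date_changed'),
        (MessageCountByMinute, 'date'),
        (ProjectCountByMinute, 'date'),
        (Event, 'datetime'),
        (Group, 'last_seen'),
        (MessageFilterValue, 'last_seen'),
    )

    cutoff = timezone.now() - datetime.timedelta(days=days)
    for model, date_col in generic_deletes:
        qs = model.objects.filter(**{'%s__lte' % date_col: cutoff})
        if project:
            qs = qs.filter(project=project)
        qs.delete()  # one bulk queryset delete per model replaces the old per-row loops
```

Worth noting that Django's `QuerySet.delete()` still collects and cascades to related rows, so this is "bulk" relative to the old per-object loop rather than a guaranteed single SQL statement per model.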
src/sentry/utils/query.py (105 changed lines)
@@ -6,8 +6,6 @@
:license: BSD, see LICENSE for more details.
"""
-from django.db.models import Min, Max
-from django.db.models.fields import AutoField, IntegerField
from django.db.models.query import QuerySet
@@ -44,12 +42,14 @@ def list(self):
class RangeQuerySetWrapper(object):
"""
- Iterates through a result set using MIN/MAX on primary key and stepping through.
+ Iterates through a queryset by chunking results by ``step`` and using GREATER THAN
+ and LESS THAN queries on the primary key.
Very efficient, but ORDER BY statements will not work.
"""
- def __init__(self, queryset, step=10000, limit=None, min_id=None, max_id=None, sorted=False):
+ def __init__(self, queryset, step=1000, limit=None, min_id=None,
+ order_by='pk'):
# Support for slicing
if queryset.query.low_mark == 0 and not\
(queryset.query.order_by or queryset.query.extra_order_by):
@@ -61,46 +61,67 @@ def __init__(self, queryset, step=10000, limit=None, min_id=None, max_id=None, s
self.limit = limit
if limit:
- self.step = min(limit, step)
+ self.step = min(limit, abs(step))
+ self.desc = step < 0
else:
- self.step = step
+ self.step = abs(step)
+ self.desc = step < 0
self.queryset = queryset
- self.min_id, self.max_id = min_id, max_id
- self.sorted = sorted
+ self.min_value = min_id
+ self.order_by = order_by
def __iter__(self):
- pk = self.queryset.model._meta.pk
- if not isinstance(pk, (IntegerField, AutoField)):
- raise NotImplementedError
+ max_value = None
+ if self.min_value is not None:
+ cur_value = self.min_value
else:
- if self.min_id is not None and self.max_id is not None:
- at, max_id = self.min_id, self.max_id
- else:
- at = self.queryset.aggregate(Min('pk'), Max('pk'))
- max_id, at = at['pk__max'], at['pk__min']
- if self.min_id:
- at = self.min_id
- if self.max_id:
- max_id = self.max_id
-
- if not (at and max_id):
- return
-
- num = 0
- limit = self.limit or max_id
-
- while at <= max_id and (not self.limit or num < self.limit):
- results = self.queryset.filter(
- id__gte=at,
- id__lte=min(at + self.step - 1, max_id),
- )
- if self.sorted:
- results = results.order_by('id')
- results = results.iterator()
-
- for result in results:
- yield result
- num += 1
- if num >= limit:
- break
- at += self.step
+ cur_value = None
+
+ num = 0
+ limit = self.limit
+
+ queryset = self.queryset
+ if max_value:
+ queryset = queryset.filter(**{'%s__lte' % self.order_by: max_value})
+ # Adjust the sort order if we're stepping through reverse
+ if self.desc:
+ queryset = queryset.order_by('-%s' % self.order_by)
+ else:
+ queryset = queryset.order_by(self.order_by)
+
+ # we implement basic cursor pagination for columns that are not unique
+ last_value = None
+ offset = 0
+ has_results = True
+ while ((max_value and cur_value <= max_value) or has_results) and (not self.limit or num < self.limit):
+ start = num
+
+ if cur_value is None:
+ results = queryset
+ elif self.desc:
+ results = queryset.filter(**{'%s__lte' % self.order_by: cur_value})
+ elif not self.desc:
+ results = queryset.filter(**{'%s__gte' % self.order_by: cur_value})
+
+ results = results[offset:offset + self.step].iterator()
+
+ for result in results:
+ yield result
+
+ num += 1
+ cur_value = getattr(result, self.order_by)
+ if cur_value == last_value:
+ offset += 1
+ else:
+ # offset needs to be based at 1 so we don't return a row
+ # that was already selected
+ last_value = cur_value
+ offset = 1
+
+ if (max_value and cur_value >= max_value) or (limit and num >= limit):
+ break
+
+ if cur_value is None:
+ break
+
+ has_results = num > start
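For reference, a hedged usage sketch of the reworked wrapper: it now paginates with keyset (greater-than/less-than) filters plus a small offset to cope with duplicate values in non-unique columns, and a negative `step` flips the direction. `handle_event` below is a hypothetical callback, not part of the patch:

```python
from sentry.utils.query import RangeQuerySetWrapper
from sentry.models import Event

def handle_event(event):
    # hypothetical per-row work; stands in for whatever the caller does
    print(event.pk)

# Oldest-first, 1000 rows per chunk, keyed on a possibly non-unique column.
for event in RangeQuerySetWrapper(Event.objects.all(), step=1000, order_by='datetime'):
    handle_event(event)

# Newest-first: a negative step reverses the ordering.
for event in RangeQuerySetWrapper(Event.objects.all(), step=-1000, order_by='datetime'):
    handle_event(event)
```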
tests/fixtures/cleanup.json (14 changed lines)
@@ -1,6 +1,7 @@
[
{
"fields": {
+ "project": 1,
"status": 0,
"first_seen": "2010-08-31 17:50:47Z",
"level": 30,
@@ -16,6 +17,7 @@
},
{
"fields": {
+ "project": 1,
"status": 0,
"first_seen": "2010-08-31 17:53:31Z",
"level": 40,
@@ -31,6 +33,7 @@
},
{
"fields": {
+ "project": 1,
"status": 0,
"first_seen": "2010-08-31 17:53:53Z",
"level": 40,
@@ -46,6 +49,7 @@
},
{
"fields": {
+ "project": 1,
"status": 0,
"first_seen": "2010-08-31 17:54:23Z",
"level": 40,
@@ -61,6 +65,7 @@
},
{
"fields": {
+ "project": 1,
"group": 2,
"server_name": "dcramer.local",
"level": 30,
@@ -76,6 +81,7 @@
},
{
"fields": {
+ "project": 1,
"group": 3,
"server_name": "dcramer.local",
"level": 40,
@@ -91,6 +97,7 @@
},
{
"fields": {
+ "project": 1,
"group": 3,
"server_name": "dcramer.local",
"level": 40,
@@ -106,6 +113,7 @@
},
{
"fields": {
+ "project": 1,
"group": 3,
"server_name": "dcramer.local",
"level": 40,
@@ -121,6 +129,7 @@
},
{
"fields": {
+ "project": 1,
"group": 4,
"server_name": "node.local",
"level": 40,
@@ -136,6 +145,7 @@
},
{
"fields": {
+ "project": 1,
"group": 4,
"server_name": "node.local",
"level": 40,
@@ -151,6 +161,7 @@
},
{
"fields": {
+ "project": 1,
"group": 4,
"server_name": "dcramer.local",
"level": 40,
@@ -166,6 +177,7 @@
},
{
"fields": {
+ "project": 1,
"group": 5,
"server_name": "dcramer.local",
"level": 40,
@@ -181,6 +193,7 @@
},
{
"fields": {
+ "project": 1,
"group": 5,
"server_name": "dcramer.local",
"level": 40,
@@ -196,6 +209,7 @@
},
{
"fields": {
+ "project": 1,
"group": 5,
"server_name": "dcramer.local",
"level": 40,
tests/sentry/tasks/cleanup/tests.py (76 changed lines)
@@ -2,14 +2,15 @@
from __future__ import absolute_import
-import logging
-
from celery.task import Task
-from sentry.models import Event, Group, MessageCountByMinute, \
- MessageFilterValue
+from sentry.models import (Event, Group, MessageCountByMinute,
+ MessageFilterValue, ProjectCountByMinute, FilterValue, FilterKey)
from sentry.tasks.cleanup import cleanup
from sentry.testutils import TestCase
+ALL_MODELS = (Event, Group, ProjectCountByMinute, MessageCountByMinute, MessageFilterValue,
+ FilterValue, FilterKey)
+
class SentryCleanupTest(TestCase):
fixtures = ['tests/fixtures/cleanup.json']
@@ -20,63 +21,20 @@ def test_is_task(self):
def test_simple(self):
cleanup(days=1)
- self.assertEquals(Event.objects.count(), 0)
- self.assertEquals(Group.objects.count(), 0)
- self.assertEquals(MessageCountByMinute.objects.count(), 0)
- self.assertEquals(MessageFilterValue.objects.count(), 0)
-
- def test_logger(self):
- cleanup(days=1, logger='sentry')
-
- self.assertEquals(Event.objects.count(), 8)
- for message in Event.objects.all():
- self.assertNotEquals(message.logger, 'sentry')
- self.assertEquals(Group.objects.count(), 3)
- for message in Group.objects.all():
- self.assertNotEquals(message.logger, 'sentry')
-
- cleanup(days=1, logger='awesome')
-
- self.assertEquals(Event.objects.count(), 4)
- for message in Event.objects.all():
- self.assertNotEquals(message.logger, 'awesome')
- self.assertEquals(Group.objects.count(), 2)
- for message in Group.objects.all():
- self.assertNotEquals(message.logger, 'awesome')
-
- cleanup(days=1, logger='root')
-
- self.assertEquals(Event.objects.count(), 0)
- self.assertEquals(Group.objects.count(), 0)
- self.assertEquals(MessageCountByMinute.objects.count(), 0)
- self.assertEquals(MessageFilterValue.objects.count(), 0)
-
- def test_server_name(self):
- cleanup(days=1, server='dcramer.local')
-
- self.assertEquals(Event.objects.count(), 2)
- for message in Event.objects.all():
- self.assertNotEquals(message.server_name, 'dcramer.local')
- self.assertEquals(Group.objects.count(), 1)
-
- cleanup(days=1, server='node.local')
+ for model in ALL_MODELS:
+ assert model.objects.count() == 0
- self.assertEquals(Event.objects.count(), 0)
- self.assertEquals(Group.objects.count(), 0)
- self.assertEquals(MessageCountByMinute.objects.count(), 0)
- self.assertEquals(MessageFilterValue.objects.count(), 0)
+ def test_project(self):
+ orig_counts = {}
+ for model in ALL_MODELS:
+ orig_counts[model] = model.objects.count()
- def test_level(self):
- cleanup(days=1, level=logging.ERROR)
+ cleanup(days=1, project=2)
- self.assertEquals(Event.objects.count(), 1)
- for message in Event.objects.all():
- self.assertNotEquals(message.level, logging.ERROR)
- self.assertEquals(Group.objects.count(), 1)
+ for model in ALL_MODELS:
+ assert model.objects.count() == orig_counts[model]
- cleanup(days=1, level=logging.DEBUG)
+ cleanup(days=1, project=1)
- self.assertEquals(Event.objects.count(), 0)
- self.assertEquals(Group.objects.count(), 0)
- self.assertEquals(MessageCountByMinute.objects.count(), 0)
- self.assertEquals(MessageFilterValue.objects.count(), 0)
+ for model in ALL_MODELS:
+ assert model.objects.count() == 0