Skip to content

Commit

Permalink
Merge pull request SmileyChris#312 from jorabra/new-cleanup-command
Browse files Browse the repository at this point in the history
Re-implement cleanup command.
  • Loading branch information
SmileyChris committed Apr 24, 2014
2 parents eb1673d + bf56d41 commit 7a03e7d
Showing 1 changed file with 147 additions and 71 deletions.
218 changes: 147 additions & 71 deletions easy_thumbnails/management/commands/thumbnail_cleanup.py
@@ -1,76 +1,152 @@
import gc
import os
import re

from django.db import models
from django.core.management.base import NoArgsCommand
import time
from datetime import datetime, date, timedelta
from optparse import make_option

from django.core.files.storage import get_storage_class
from django.core.management.base import BaseCommand
from easy_thumbnails.conf import settings
from easy_thumbnails.models import Source


class ThumbnailCollectionCleaner(object):
"""
Remove thumbnails and DB references to non-existing source images.
"""
# Counters below are class-level defaults; ``clean_up`` updates them on the
# instance and ``print_stats`` reports them.
# Number of Source rows visited by ``clean_up``.
sources = 0
# NOTE(review): never updated anywhere in the visible code.
thumbnails = 0
# Incremented once per thumbnail record of a missing source, before the
# check whether the thumbnail file itself exists.
thumbnails_deleted = 0
# Number of sources whose file was reported as not existing.
source_refs_deleted = 0
# Rounded wall-clock seconds of the last ``clean_up`` run.
execution_time = 0

# Fall back to the ``sets`` module when the ``set`` builtin is not defined.
try:
set
except NameError:
from sets import Set as set # For Python 2.3

# Matches thumbnail file names of the form
# ``<THUMBNAIL_PREFIX><name>.<digits>x<digits>_<options>q<1-100>.jpg``.
# Group 1 is the part used later as the expected original file name.
# NOTE(review): the prefix is interpolated into the pattern unescaped, and
# ``<digits>x<digits>`` is presumably width x height -- confirm.
thumb_re = re.compile(
r'^%s(.*)\.\d{1,}x\d{1,}_[-\w]*q([1-9]\d?|100)\.jpg' %
settings.THUMBNAIL_PREFIX)


def get_thumbnail_path(path):
    """
    Return the thumbnail directory for ``path``.

    The result is ``THUMBNAIL_BASEDIR``, ``path`` and ``THUMBNAIL_SUBDIR``
    joined in that order.
    """
    return os.path.join(
        settings.THUMBNAIL_BASEDIR, path, settings.THUMBNAIL_SUBDIR)


def clean_up():
paths = set()
for app in models.get_apps():
model_list = models.get_models(app)
for model in model_list:
for field in model._meta.fields:
if isinstance(field, models.ImageField):
#TODO: take care of date formatted and callable upload_to.
if (not callable(field.upload_to) and
field.upload_to.find("%") == -1):
paths = paths.union((field.upload_to,))
paths = list(paths)
for path in paths:
thumbnail_path = get_thumbnail_path(path)
def _get_absolute_path(self, path):
    """Return ``path`` joined onto ``settings.MEDIA_ROOT``."""
    media_root = settings.MEDIA_ROOT
    return os.path.join(media_root, path)

def _get_relative_path(self, path):
    """Return ``path`` expressed relative to ``settings.MEDIA_ROOT``."""
    media_root = settings.MEDIA_ROOT
    return os.path.relpath(path, media_root)

def _check_if_exists(self, storage, path):
try:
file_list = os.listdir(os.path.join(settings.MEDIA_ROOT,
thumbnail_path))
except OSError:
continue # Dir doesn't exists, no thumbnails here.
for fn in file_list:
m = thumb_re.match(fn)
if m:
# Due to that the naming of thumbnails replaces the dot before
# extension with an underscore we have 2 possibilities for the
# original filename. If either present we do not delete
# suspected thumbnail.
# org_fn is the expected original filename w/o extension
# org_fn_alt is the expected original filename with extension
org_fn = m.group(1)
org_fn_exists = os.path.isfile(
os.path.join(settings.MEDIA_ROOT, path, org_fn))

usc_pos = org_fn.rfind("_")
if usc_pos != -1:
org_fn_alt = "%s.%s" % (
org_fn[0:usc_pos], org_fn[usc_pos + 1:])
org_fn_alt_exists = os.path.isfile(
os.path.join(settings.MEDIA_ROOT, path, org_fn_alt))
else:
org_fn_alt_exists = False
if not org_fn_exists and not org_fn_alt_exists:
del_me = os.path.join(settings.MEDIA_ROOT,
thumbnail_path, fn)
os.remove(del_me)


class Command(NoArgsCommand):
    """Argument-less management command wrapping the module-level ``clean_up``."""

    requires_model_validation = False
    help = "Deletes thumbnails that no longer have an original file."

    def handle_noargs(self, **options):
        """Run the cleanup; the received options are not used."""
        clean_up()
return storage.exists(path)
except Exception as e:
print "Something went wrong when checking existance of %s:" % path
print str(e)

def _delete_sources_by_id(self, ids):
    """Delete every ``Source`` row whose id is contained in ``ids``."""
    doomed = Source.objects.all().filter(id__in=ids)
    doomed.delete()

def clean_up(self, dry_run=False, verbosity=1, last_n_days=0,
             cleanup_path=None, storage=None):
    """
    Iterate through sources. Delete database references to sources
    not existing, including its corresponding thumbnails (files and
    database references).

    Arguments:
    dry_run -- when true, report what would be removed but neither delete
        thumbnail files nor database rows.
    verbosity -- values above 0 print one line per missing source and
        per thumbnail file found for it.
    last_n_days -- when above 0, only consider sources whose ``modified``
        value is at most that many days old (today included).
    cleanup_path -- when given, only consider sources whose name starts
        with this prefix.
    storage -- storage object used for ``exists``/``delete``; defaults to
        an instance of ``settings.THUMBNAIL_DEFAULT_STORAGE``.

    Updates ``sources``, ``source_refs_deleted``, ``thumbnails_deleted``
    and ``execution_time`` on the instance. The counters are also
    incremented during a dry run.
    """
    if dry_run:
        print("Dry run...")

    if not storage:
        storage = get_storage_class(settings.THUMBNAIL_DEFAULT_STORAGE)()

    # Source rows are deleted in batches to keep the id list small.
    batch_size = 1000
    sources_to_delete = []
    time_start = time.time()

    query = Source.objects.all()
    if last_n_days > 0:
        # Use an open-ended lower bound. A ``range`` lookup with a date as
        # its upper bound is interpreted as midnight on a datetime column,
        # which would leave out everything modified today.
        cutoff = date.today() - timedelta(days=last_n_days)
        query = query.filter(modified__gte=cutoff)
    if cleanup_path:
        query = query.filter(name__startswith=cleanup_path)

    for source in queryset_iterator(query):
        self.sources += 1
        abs_source_path = self._get_absolute_path(source.name)

        if self._check_if_exists(storage, abs_source_path):
            continue

        if verbosity > 0:
            print("Source not present: %s" % (abs_source_path,))
        self.source_refs_deleted += 1

        for thumb in source.thumbnails.all():
            # Counted per thumbnail record, whether or not its file is
            # still present in the storage.
            self.thumbnails_deleted += 1
            abs_thumbnail_path = self._get_absolute_path(thumb.name)

            if self._check_if_exists(storage, abs_thumbnail_path):
                if not dry_run:
                    storage.delete(abs_thumbnail_path)
                if verbosity > 0:
                    print("Deleting thumbnail: %s" % (abs_thumbnail_path,))

        # A dry run never deletes rows, so there is no point in
        # accumulating their ids.
        if not dry_run:
            sources_to_delete.append(source.id)
            if len(sources_to_delete) >= batch_size:
                self._delete_sources_by_id(sources_to_delete)
                sources_to_delete = []

    if not dry_run:
        self._delete_sources_by_id(sources_to_delete)
    self.execution_time = round(time.time() - time_start)

def print_stats(self):
    """
    Print statistics about the cleanup performed.

    Writes a dash-padded timestamp header, the three counters
    (``sources``, ``source_refs_deleted``, ``thumbnails_deleted``) and
    ``execution_time`` to standard output.
    """
    # Each print receives one pre-formatted string, so the parenthesised
    # form produces the same output on Python 2 and Python 3.
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M ')
    print('{:-<48}'.format(stamp))
    print("{:<40} {:>7}".format("Sources checked:", self.sources))
    print("{:<40} {:>7}".format("Source references deleted from DB:",
                                self.source_refs_deleted))
    print("{:<40} {:>7}".format("Thumbnails deleted from disk:",
                                self.thumbnails_deleted))
    print("(Completed in %s seconds)\n" % self.execution_time)


def queryset_iterator(queryset, chunksize=1000):
    """
    Yield the rows of ``queryset`` in primary-key order, ``chunksize`` at a
    time.

    The queryset iterator helps to keep the memory consumption down.
    And also making it easier to process for weaker computers.

    Rows are fetched with ``pk > <last yielded pk>`` starting from 0, so
    rows whose pk is 0 or negative are not visited. An empty queryset
    yields nothing.
    """
    # Slice instead of indexing so an empty queryset gives an empty list
    # rather than raising IndexError.
    newest = list(queryset.order_by('-pk')[:1])
    if not newest:
        return
    last_pk = newest[0].pk

    primary_key = 0
    queryset = queryset.order_by('pk')
    while primary_key < last_pk:
        fetched = False
        for row in queryset.filter(pk__gt=primary_key)[:chunksize]:
            fetched = True
            primary_key = row.pk
            yield row
        if not fetched:
            # The remaining rows (including the one holding last_pk) are
            # gone; without this guard the loop would never terminate.
            break
        gc.collect()


class Command(BaseCommand):
    """Management command that runs ``ThumbnailCollectionCleaner``."""

    help = """ Deletes thumbnails that no longer have an original file. """

    option_list = BaseCommand.option_list + (
        make_option(
            '--dry-run',
            dest='dry_run',
            action='store_true',
            default=False,
            help='Dry run the execution.'),
        make_option(
            '--last-n-days',
            dest='last_n_days',
            action='store',
            type='int',
            default=0,
            help='The number of days back in time to clean thumbnails for.'),
        make_option(
            '--path',
            dest='cleanup_path',
            action='store',
            type='string',
            help='Specify a path to clean up.'),
    )

    def handle(self, *args, **options):
        """Translate the parsed options into a cleanup run, then print its stats."""
        cleaner = ThumbnailCollectionCleaner()
        cleanup_kwargs = {
            'dry_run': options.get('dry_run', False),
            'verbosity': int(options.get('verbosity', 1)),
            'last_n_days': int(options.get('last_n_days', 0)),
            'cleanup_path': options.get('cleanup_path'),
        }
        cleaner.clean_up(**cleanup_kwargs)
        cleaner.print_stats()

0 comments on commit 7a03e7d

Please sign in to comment.