"""
Re-implemented thumbnail cleanup management command.

Removes thumbnails (files and database references) that no longer
reference an existing source image.
"""
import gc
import os
import time
from datetime import datetime, date, timedelta
from optparse import make_option

from django.core.files.storage import get_storage_class
from django.core.management.base import BaseCommand

from easy_thumbnails.conf import settings
from easy_thumbnails.models import Source


class ThumbnailCollectionCleaner(object):
    """
    Remove thumbnails and DB references to non-existing source images.

    The counters below are updated by ``clean_up`` and reported by
    ``print_stats``.
    """
    # Number of Source rows examined.
    sources = 0
    # Not updated anywhere in this module; kept so existing readers of the
    # attribute keep working.
    thumbnails = 0
    # Thumbnail files removed from storage (or that would be, in a dry run).
    thumbnails_deleted = 0
    # Source rows whose file is missing and that are (or would be) deleted.
    source_refs_deleted = 0
    # Duration of the last ``clean_up`` call, rounded to whole seconds.
    execution_time = 0

    def _get_absolute_path(self, path):
        """Return ``path`` joined onto ``settings.MEDIA_ROOT``."""
        return os.path.join(settings.MEDIA_ROOT, path)

    def _get_relative_path(self, path):
        """Return ``path`` expressed relative to ``settings.MEDIA_ROOT``."""
        return os.path.relpath(path, settings.MEDIA_ROOT)

    def _check_if_exists(self, storage, path):
        """
        Ask ``storage`` whether ``path`` exists.

        Returns whatever ``storage.exists(path)`` returns, or ``None`` when
        the check itself raised. ``None`` means "unknown": callers must not
        treat it as "missing", otherwise a transient storage error would
        cause valid data to be deleted.
        """
        try:
            return storage.exists(path)
        except Exception as e:
            # Best effort: report the failure and let the caller skip the
            # entry instead of aborting the whole run.
            print("Something went wrong when checking existance of %s:"
                  % path)
            print(str(e))
            return None

    def _delete_sources_by_id(self, ids):
        """Delete the Source rows whose ids are listed in ``ids``."""
        if ids:
            Source.objects.filter(id__in=ids).delete()

    def clean_up(self, dry_run=False, verbosity=1, last_n_days=0,
                 cleanup_path=None, storage=None):
        """
        Iterate through sources. Delete database references to sources
        not existing, including its corresponding thumbnails (files and
        database references).

        :param dry_run: when true, only report and count; nothing is
            deleted from the storage or the database.
        :param verbosity: values above 0 print one line per missing source
            and per deleted thumbnail.
        :param last_n_days: when above 0, only sources whose ``modified``
            value is no older than this many days are examined.
        :param cleanup_path: when given, only sources whose name starts
            with this prefix are examined.
        :param storage: storage object used for the existence checks and
            deletions; defaults to an instance of
            ``settings.THUMBNAIL_DEFAULT_STORAGE``.

        Sources whose existence check fails with an error are skipped.
        """
        if dry_run:
            print("Dry run...")

        if not storage:
            storage = get_storage_class(settings.THUMBNAIL_DEFAULT_STORAGE)()

        sources_to_delete = []
        time_start = time.time()

        query = Source.objects.all()
        if last_n_days > 0:
            # Lower bound only. A ``__range`` ending at ``date.today()``
            # stops at midnight for a datetime column and would silently
            # skip everything modified today.
            # NOTE(review): assumes ``modified`` is a date/datetime field on
            # Source -- defined in easy_thumbnails.models, confirm there.
            query = query.filter(
                modified__gte=date.today() - timedelta(days=last_n_days))
        if cleanup_path:
            query = query.filter(name__startswith=cleanup_path)

        for source in queryset_iterator(query):
            self.sources += 1
            # NOTE(review): names are made absolute under MEDIA_ROOT before
            # being handed to the storage; confirm the configured storage
            # backend expects that.
            abs_source_path = self._get_absolute_path(source.name)

            source_exists = self._check_if_exists(storage, abs_source_path)
            if source_exists or source_exists is None:
                # Present, or the check failed: keep the source either way.
                continue

            if verbosity > 0:
                print("Source not present: %s" % abs_source_path)
            self.source_refs_deleted += 1
            if not dry_run:
                # Ids are only collected for real runs so a dry run does
                # not build up an ever-growing list.
                sources_to_delete.append(source.id)

            for thumb in source.thumbnails.all():
                abs_thumbnail_path = self._get_absolute_path(thumb.name)
                if not self._check_if_exists(storage, abs_thumbnail_path):
                    # Nothing in storage (or unknown): nothing to delete.
                    continue

                self.thumbnails_deleted += 1
                if not dry_run:
                    storage.delete(abs_thumbnail_path)
                if verbosity > 0:
                    print("Deleting thumbnail: %s" % abs_thumbnail_path)

            # Flush in batches to bound the size of the ``id__in`` query.
            # Presumably the related thumbnail rows go with the source rows
            # (cascade) -- confirm against the Thumbnail model.
            if len(sources_to_delete) >= 1000:
                self._delete_sources_by_id(sources_to_delete)
                sources_to_delete = []

        if not dry_run:
            self._delete_sources_by_id(sources_to_delete)
        self.execution_time = round(time.time() - time_start)

    def print_stats(self):
        """
        Print statistics about the cleanup performed.
        """
        print('{:-<48}'.format(datetime.now().strftime('%Y-%m-%d %H:%M ')))
        print("{:<40} {:>7}".format("Sources checked:", self.sources))
        print("{:<40} {:>7}".format("Source references deleted from DB:",
                                    self.source_refs_deleted))
        print("{:<40} {:>7}".format("Thumbnails deleted from disk:",
                                    self.thumbnails_deleted))
        print("(Completed in %s seconds)\n" % self.execution_time)


def queryset_iterator(queryset, chunksize=1000):
    """
    The queryset iterator helps to keep the memory consumption down.
    And also making it easier to process for weaker computers.

    Yields the rows of ``queryset`` ordered by primary key, fetching at
    most ``chunksize`` rows per query. Iteration ends on the first chunk
    that comes back short, so an empty queryset yields nothing and rows
    removed while iterating cannot stall the loop.
    """
    queryset = queryset.order_by('pk')
    last_seen_pk = None
    while True:
        chunk = queryset
        if last_seen_pk is not None:
            # Resume right after the last row handed out.
            chunk = chunk.filter(pk__gt=last_seen_pk)

        fetched = 0
        for row in chunk[:chunksize]:
            last_seen_pk = row.pk
            fetched += 1
            yield row

        gc.collect()
        if fetched < chunksize:
            # Short (or empty) chunk: there is nothing left to fetch.
            return


class Command(BaseCommand):
    """Management command entry point for the thumbnail cleanup."""

    help = """ Deletes thumbnails that no longer have an original file. """

    option_list = BaseCommand.option_list + (
        make_option(
            '--dry-run',
            action='store_true',
            dest='dry_run',
            default=False,
            help='Dry run the execution.'),
        make_option(
            '--last-n-days',
            action='store',
            dest='last_n_days',
            default=0,
            type='int',
            help='The number of days back in time to clean thumbnails for.'),
        make_option(
            '--path',
            action='store',
            dest='cleanup_path',
            type='string',
            help='Specify a path to clean up.'),
    )

    def handle(self, *args, **options):
        """Run the cleanup with the parsed options, then print the stats."""
        tcc = ThumbnailCollectionCleaner()
        tcc.clean_up(
            dry_run=options.get('dry_run', False),
            verbosity=int(options.get('verbosity', 1)),
            last_n_days=int(options.get('last_n_days', 0)),
            cleanup_path=options.get('cleanup_path'))
        tcc.print_stats()