Commit

Conflict
palewire committed Apr 25, 2017
2 parents 70c893f + e2b1753 commit f529339
Showing 8 changed files with 129 additions and 73 deletions.
5 changes: 2 additions & 3 deletions .travis.yml
@@ -15,9 +15,8 @@ env:
- AWS_ACCESS_KEY_ID="MOCK_ACCESS_KEY_ID"
- AWS_SECRET_ACCESS_KEY="MOCK_SECRET_ACCESS_KEY"
matrix:
- DJANGO_VERSION=1.8.17
- DJANGO_VERSION=1.9.12
- DJANGO_VERSION=1.10.6
- DJANGO_VERSION=1.8.18
- DJANGO_VERSION=1.10.7
- DJANGO_VERSION=1.11

install:
6 changes: 5 additions & 1 deletion Makefile
@@ -1,4 +1,8 @@
.PHONY: ship
.PHONY: test ship

test:
flake8 bakery
python setup.py test

ship:
python setup.py sdist bdist_wheel
2 changes: 2 additions & 0 deletions bakery/management/commands/__init__.py
@@ -45,6 +45,8 @@ def get_all_objects_in_bucket(
Little utility method that handles pagination and returns
all objects in given bucket.
"""
logger.debug("Retrieving bucket object list")

if not s3_client:
s3_client, s3_resource = get_s3_client()

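The docstring above notes that get_all_objects_in_bucket pages through the bucket listing, since S3 returns at most 1,000 keys per request. As a rough illustration only — not the helper's actual body, which isn't shown in this hunk — here is one way such pagination is commonly written with boto3's paginator:

    import boto3

    def list_all_objects(bucket_name, s3_client=None):
        """Return a dict mapping every key in the bucket to its metadata."""
        s3_client = s3_client or boto3.client("s3")
        obj_dict = {}
        # list_objects_v2 returns at most 1,000 keys per call, so page through
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name):
            for obj in page.get("Contents", []):
                obj_dict[obj["Key"]] = obj
        return obj_dict
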
10 changes: 5 additions & 5 deletions bakery/management/commands/build.py
@@ -58,11 +58,11 @@ def add_arguments(self, parser):
help="Skip collecting the media files when building."
)
parser.add_argument(
"--no-pooling",
"--pooling",
action="store_true",
dest="no_pooling",
dest="pooling",
default=False,
help=("Run builds one by one rather than pooling them to run concurrently.")
help=("Pool builds to run concurrently rather than running them one by one.")
)

def handle(self, *args, **options):
@@ -115,7 +115,7 @@ def set_options(self, *args, **options):
raise CommandError(self.views_unconfig_msg)
self.view_list = settings.BAKERY_VIEWS

self.no_pooling = options.get('no_pooling')
self.pooling = options.get('pooling')

def init_build_dir(self):
"""
@@ -219,7 +219,7 @@ def copytree_and_gzip(self, source_dir, target_dir):
build_list.append((source_path, target_path))

# Build em all
if getattr(self, 'no_pooling', False):
if not getattr(self, 'pooling', False):
[self.copyfile_and_gzip(*u) for u in build_list]
else:
cpu_count = multiprocessing.cpu_count()
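With this hunk the build command copies and gzips files one by one by default and offers an opt-in --pooling flag for concurrent copying. A minimal usage sketch, mirroring how the test suite drives management commands; the keyword form follows dest="pooling" in the diff above:

    from django.core.management import call_command

    # Default behavior: copy the built files serially
    call_command("build")

    # Opt in to pooling the copy step across all available CPUs
    call_command("build", pooling=True)

The command-line equivalent of the second call is python manage.py build --pooling.
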
153 changes: 100 additions & 53 deletions bakery/management/commands/publish.py
@@ -3,6 +3,7 @@
import hashlib
import logging
import mimetypes
import multiprocessing
from django.conf import settings
from multiprocessing.pool import ThreadPool
from bakery import DEFAULT_GZIP_CONTENT_TYPES
@@ -86,30 +87,53 @@ def handle(self, *args, **options):
self.set_options(options)

# Initialize the boto connection
logger.debug("Connecting to s3")
if self.verbosity > 2:
self.stdout.write("Connecting to s3")
self.s3_client, self.s3_resource = get_s3_client()

# Grab our bucket
logger.debug("Retriving bucket {}".format(self.aws_bucket_name))
if self.verbosity > 2:
self.stdout.write("Retriving bucket {}".format(self.aws_bucket_name))
self.bucket = self.s3_resource.Bucket(self.aws_bucket_name)

# Get a list of all keys in our s3 bucket
self.s3_obj_dict = self.get_all_objects_in_bucket(
self.aws_bucket_name,
self.s3_client
)
# Get a list of all keys in our s3 bucket ...
# ... unless this is a case where we're blindly pushing everything
if self.force_publish and self.no_delete:
self.blind_upload = True
logger.debug("Skipping object retrieval. We won't need to because we're blinding uploading everything.")
self.s3_obj_dict = {}
else:
self.blind_upload = False
logger.debug("Retrieving objects now published in bucket")
if self.verbosity > 2:
self.stdout.write("Retrieving objects now published in bucket")
self.s3_obj_dict = self.get_all_objects_in_bucket(
self.aws_bucket_name,
self.s3_client
)

# Get a list of all the local files in our build directory
logger.debug("Retrieving files built locally")
if self.verbosity > 2:
self.stdout.write("Retrieving files built locally")
self.local_file_list = self.get_local_file_list()

# Sync the two
# Sync local files with s3 bucket
logger.debug("Syncing local files with bucket")
if self.verbosity > 2:
self.stdout.write("Syncing local files with bucket")
self.sync_with_s3()

# Delete anything that's left in our keys dict
if not self.dry_run and not self.no_delete:
self.deleted_file_list = list(self.s3_obj_dict.keys())
self.deleted_files = len(self.deleted_file_list)
if self.deleted_files:
logger.debug("Deleting %s keys" % self.deleted_files)
if self.verbosity > 0:
logger.debug("deleting %s keys" % self.deleted_files)
self.stdout.write("Deleting %s keys" % self.deleted_files)
self.batch_delete_s3_objects(
self.deleted_file_list,
self.aws_bucket_name
@@ -125,23 +149,19 @@ def handle(self, *args, **options):

# We're finished, print the final output
elapsed_time = time.time() - self.start_time
msg = "Publish completed, %d uploaded and %d deleted files in %.2f seconds" % (
self.uploaded_files,
self.deleted_files,
elapsed_time
)
logger.info(msg)
if self.verbosity > 0:
msg = "publish completed, %d uploaded and %d deleted files in %.2f seconds" % (
self.uploaded_files,
self.deleted_files,
elapsed_time
)
self.stdout.write(msg)
logger.info(msg)

if self.verbosity > 2:
for f in self.uploaded_file_list:
logger.info("updated file: %s" % f)
for f in self.deleted_file_list:
logger.info("deleted file: %s" % f)

if self.dry_run:
logger.info("publish executed with the --dry-run option. No content was changed on S3.")
logger.info("Publish executed with the --dry-run option. No content was changed on S3.")
if self.verbosity > 0:
self.stdout.write("Publish executed with the --dry-run option. No content was changed on S3.")

def set_options(self, options):
"""
@@ -202,7 +222,7 @@ def set_options(self, options):
if options.get('dry_run'):
self.dry_run = True
if self.verbosity > 0:
logger.info("executing with the --dry-run option set.")
logger.info("Executing with the --dry-run option set.")
else:
self.dry_run = False

@@ -233,40 +253,66 @@ def sync_with_s3(self):
of keys in the S3 bucket.
"""
# Create a list to put all the files we're going to update
update_list = []

for file_key in self.local_file_list:
# store a reference to the absolute path, if we have to open it
abs_file_path = os.path.join(self.build_dir, file_key)

# check if the file exists
if file_key in self.s3_obj_dict:
s3_etag = self.s3_obj_dict[file_key].get('ETag').strip('"')
local_md5 = hashlib.md5(
open(abs_file_path, "rb").read()
).hexdigest()

# don't upload if the md5 sums are the same
if s3_etag == local_md5 and not self.force_publish:
pass
elif self.force_publish:
update_list.append((file_key, abs_file_path))
else:
update_list.append((file_key, abs_file_path))

# remove the file from the dict, we don't need it anymore
del self.s3_obj_dict[file_key]

# if the file doesn't exist, create it
else:
update_list.append((file_key, abs_file_path))
self.update_list = []

# Upload all these files
# Figure out which files need to be updated and upload all these files
logger.debug("Comparing {} local files with bucket".format(len(self.local_file_list)))
if self.no_pooling:
[self.upload_to_s3(*u) for u in update_list]
[self.compare_local_file(f) for f in self.local_file_list]
else:
cpu_count = multiprocessing.cpu_count()
logger.debug("Pooling local file comparison on {} CPUs".format(cpu_count))
pool = ThreadPool(processes=cpu_count)
pool.map(self.compare_local_file, self.local_file_list)

logger.debug("Uploading {} new or updated files to bucket".format(len(self.update_list)))
if self.no_pooling:
[self.upload_to_s3(*u) for u in self.update_list]
else:
logger.debug("Pooling s3 uploads on {} CPUs".format(cpu_count))
pool = ThreadPool(processes=cpu_count)
pool.map(self.pooled_upload_to_s3, self.update_list)

def compare_local_file(self, file_key):
"""
Compares a local version of a file with what's already published.
If an update is needed, the file's key is added to self.update_list.
"""
# Where is the file?
file_path = os.path.join(self.build_dir, file_key)

# If we're in force_publish mode just add it
if self.force_publish:
self.update_list.append((file_key, file_path))
# And quit now
return

# Does it exist in our s3 object list?
if file_key in self.s3_obj_dict:

# If it does, open up the local file and convert it to a hexdigest
local_data = open(file_path, "rb").read()
local_md5 = hashlib.md5(local_data).hexdigest()

# Now let's compare it to the hexdigest of what's on s3
s3_md5 = self.s3_obj_dict[file_key].get('ETag').strip('"')

# If their md5 hexdigests match, do nothing
if s3_md5 == local_md5:
pass
# If they don't match, we want to add it
else:
logger.debug("{} has changed".format(file_key))
self.update_list.append((file_key, file_path))

# Remove the file from the s3 dict, we don't need it anymore
del self.s3_obj_dict[file_key]

# If the file doesn't exist, queue it for creation
else:
pool = ThreadPool(processes=10)
pool.map(self.pooled_upload_to_s3, update_list)
logger.debug("{} has been added".format(file_key))
self.update_list.append((file_key, file_path))

def pooled_upload_to_s3(self, payload):
"""
@@ -302,8 +348,9 @@ def upload_to_s3(self, key, filename):

# access and write the contents from the file
if not self.dry_run:
logger.debug("Uploading %s" % filename)
if self.verbosity > 0:
logger.debug("uploading %s" % filename)
self.stdout.write("Uploading %s" % filename)
s3_obj = self.s3_resource.Object(self.aws_bucket_name, key)
s3_obj.upload_file(filename, ExtraArgs=extra_args)
self.uploaded_files += 1
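The new compare_local_file method above decides whether to re-upload a file by comparing the MD5 hexdigest of the local copy with the object's S3 ETag. Here is a standalone sketch of that check; it assumes single-part uploads, where the ETag happens to equal the MD5 of the object body, so treat it as an illustration rather than the command's exact logic:

    import hashlib

    def needs_upload(local_path, s3_obj_meta, force_publish=False):
        """Return True if the local file should be (re)uploaded to S3."""
        if force_publish or s3_obj_meta is None:
            return True
        with open(local_path, "rb") as f:
            local_md5 = hashlib.md5(f.read()).hexdigest()
        # S3 wraps the ETag value in literal double quotes
        s3_md5 = s3_obj_meta.get("ETag", "").strip('"')
        return local_md5 != s3_md5
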
14 changes: 7 additions & 7 deletions bakery/tests/__init__.py
@@ -268,8 +268,6 @@ def test_redirect_view(self):
"detail/badurl.html"
)
self.assertTrue(os.path.exists(build_path))
# with mock_s3():
# MockRedirectView().post_publish(settings.AWS_BUCKET_NAME)

def test_404_view(self):
v = views.Buildable404View()
@@ -358,7 +356,7 @@ def test_publish_cmd(self):
with mock_s3():
self._create_bucket()
call_command("build")
call_command("publish", no_pooling=True, verbosity=3)
call_command("publish", verbosity=3)
local_file_list = []
for (dirpath, dirnames, filenames) in os.walk(
settings.BUILD_DIR):
@@ -375,13 +373,15 @@ def test_publish_cmd(self):
self.assertIn(obj.get('Key'), local_file_list)
call_command("unbuild")
os.makedirs(settings.BUILD_DIR)
call_command("publish", no_pooling=True, verbosity=3)
call_command("publish", verbosity=3)

call_command("publish", no_delete=True, force_publish=True)

def test_unpublish_cmd(self):
with mock_s3():
self._create_bucket()
call_command("build")
call_command("unpublish", no_pooling=True, verbosity=3)
call_command("unpublish", verbosity=3)
self.assertFalse(self._get_bucket_objects())

# def test_tasks(self):
@@ -418,7 +418,7 @@ def test_cache_control(self):
}):
self._create_bucket()
call_command("build")
call_command("publish", no_pooling=True, verbosity=3)
call_command("publish", verbosity=3)

for obj in self._get_bucket_objects():
s3_obj = s3.Object(
@@ -443,7 +443,7 @@ def test_batch_unpublish(self):
obj = s3.Object(settings.AWS_BUCKET_NAME, key)
obj.put('This is test object %s' % i)
keys.append(key)
call_command("unpublish", no_pooling=True, verbosity=3)
call_command("unpublish", verbosity=3)
self.assertFalse(self._get_bucket_objects())

def test_get_s3_client_honors_settings_over_environ(self):
7 changes: 7 additions & 0 deletions docs/changelog.rst
@@ -1,6 +1,13 @@
Changelog
=========

0.10.0
------

* Pooling of file comparisons between published and local files is now the default, for faster performance
* New option to opt in to pooled building of local files, for faster performance
* When the ``--force`` and ``--no-delete`` options are both passed to the publish command, the S3 object list is not retrieved, for faster performance

0.9.3
-----

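The third bullet in the changelog hunk above describes a fast path: when both --force and --no-delete are passed to publish, the command never lists the bucket's existing objects and simply uploads everything. A minimal invocation sketch, using the same keyword arguments the test suite passes to call_command:

    from django.core.management import call_command

    # Skip the bucket listing entirely and push every local file as-is;
    # nothing is deleted from the bucket because no_delete is set.
    call_command("publish", force_publish=True, no_delete=True)
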
5 changes: 1 addition & 4 deletions setup.py
@@ -51,9 +51,6 @@ def run(self):
},
],
BUILD_DIR = tempfile.mkdtemp(),
BAKERY_BUILD_ON_S3 = True,
BAKERY_AWS_BUILD_BUCKET = 'build-bucket',
BAKERY_AWS_PUBLISH_BUCKET = 'publish-bucket',
STATIC_ROOT = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
@@ -91,7 +88,7 @@ def run(self):

setup(
name='django-bakery',
version='0.10.0-rc.5',
version='0.10.1',
description='A set of helpers for baking your Django site out as flat files',
author='The Los Angeles Times Data Desk',
author_email='datadesk@latimes.com',
